diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt
--- a/llvm/lib/Target/X86/CMakeLists.txt
+++ b/llvm/lib/Target/X86/CMakeLists.txt
@@ -45,6 +45,7 @@
   X86FixupBWInsts.cpp
   X86FixupLEAs.cpp
   X86FixupInstTuning.cpp
+  X86FixupVectorConstants.cpp
   X86AvoidStoreForwardingBlocks.cpp
   X86DynAllocaExpander.cpp
   X86FixupSetCC.cpp
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -63,10 +63,13 @@
 /// instructions, in order to eliminate execution delays in some processors.
 FunctionPass *createX86FixupLEAs();
 
-/// Return as pass that replaces equivilent slower instructions with faster
+/// Return a pass that replaces equivalent slower instructions with faster
 /// ones.
 FunctionPass *createX86FixupInstTuning();
 
+/// Return a pass that reduces the size of vector constant pool loads.
+FunctionPass *createX86FixupVectorConstants();
+
 /// Return a pass that removes redundant LEA instructions and redundant address
 /// recalculations.
 FunctionPass *createX86OptimizeLEAs();
@@ -174,6 +177,7 @@
 void initializeFixupLEAPassPass(PassRegistry &);
 void initializeX86ArgumentStackSlotPassPass(PassRegistry &);
 void initializeX86FixupInstTuningPassPass(PassRegistry &);
+void initializeX86FixupVectorConstantsPassPass(PassRegistry &);
 void initializeWinEHStatePassPass(PassRegistry &);
 void initializeX86AvoidSFBPassPass(PassRegistry &);
 void initializeX86AvoidTrailingCallPassPass(PassRegistry &);
diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp
@@ -0,0 +1,255 @@
+//===-- X86FixupVectorConstants.cpp - optimize constant generation -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file examines all full size vector constant pool loads and attempts to
+// replace them with smaller constant pool entries, including folding into
+// AVX512 broadcast-able instructions (TODO), and broadcasting or extending
+// (TODO) the loads.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-fixup-vector-constants"
+
+STATISTIC(NumInstChanges, "Number of instruction changes");
+
+namespace {
+class X86FixupVectorConstantsPass : public MachineFunctionPass {
+public:
+  static char ID;
+
+  X86FixupVectorConstantsPass() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "X86 Fixup Vector Constants";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &I);
+
+  // This pass runs after regalloc and doesn't support VReg operands.
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  const X86InstrInfo *TII = nullptr;
+  const X86Subtarget *ST = nullptr;
+  const MCSchedModel *SM = nullptr;
+};
+} // end anonymous namespace
+
+char X86FixupVectorConstantsPass::ID = 0;
+
+INITIALIZE_PASS(X86FixupVectorConstantsPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
+
+FunctionPass *llvm::createX86FixupVectorConstants() {
+  return new X86FixupVectorConstantsPass();
+}
+
+static const Constant *getConstantFromPool(const MachineInstr &MI,
+                                           const MachineOperand &Op) {
+  if (!Op.isCPI() || Op.getOffset() != 0)
+    return nullptr;
+
+  ArrayRef<MachineConstantPoolEntry> Constants =
+      MI.getParent()->getParent()->getConstantPool()->getConstants();
+  const MachineConstantPoolEntry &ConstantEntry = Constants[Op.getIndex()];
+
+  // Bail if this is a machine constant pool entry, we won't be able to dig out
+  // anything useful.
+  if (ConstantEntry.isMachineConstantPoolEntry())
+    return nullptr;
+
+  return ConstantEntry.Val.ConstVal;
+}
+
+std::optional<APInt> getConstantBits(const Constant *C) {
+  unsigned NumBits = C->getType()->getPrimitiveSizeInBits();
+
+  if (auto *CInt = dyn_cast<ConstantInt>(C))
+    return CInt->getValue();
+
+  if (auto *CFP = dyn_cast<ConstantFP>(C))
+    return CFP->getValue().bitcastToAPInt();
+
+  if (auto *CV = dyn_cast<ConstantVector>(C))
+    if (auto *CVSplat = CV->getSplatValue(/*AllowUndefs*/ true))
+      if (std::optional<APInt> Bits = getConstantBits(CVSplat)) {
+        assert((NumBits % Bits->getBitWidth()) == 0 && "Illegal splat");
+        return APInt::getSplat(NumBits, *Bits);
+      }
+
+  if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
+    bool IsInteger = CDS->getElementType()->isIntegerTy();
+    bool IsFloat = CDS->getElementType()->isHalfTy() ||
+                   CDS->getElementType()->isFloatTy() ||
+                   CDS->getElementType()->isDoubleTy();
+    if (IsInteger || IsFloat) {
+      APInt Bits = APInt::getZero(NumBits);
+      unsigned EltBits = CDS->getElementType()->getPrimitiveSizeInBits();
+      for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) {
+        if (IsInteger)
+          Bits.insertBits(CDS->getElementAsAPInt(I), I * EltBits);
+        else
+          Bits.insertBits(CDS->getElementAsAPFloat(I).bitcastToAPInt(),
+                          I * EltBits);
+      }
+      return Bits;
+    }
+  }
+
+  return std::nullopt;
+}
+
+bool X86FixupVectorConstantsPass::processInstruction(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator &I) {
+  MachineInstr &MI = *I;
+  unsigned Opc = MI.getOpcode();
+  MachineConstantPool *CP = MI.getParent()->getParent()->getConstantPool();
+
+  auto ConvertToBroadcastInt = [&](unsigned OpBcst64, unsigned OpBcst32,
+                                   unsigned OpBcst16, unsigned OpBcst8) {
+    assert(MI.getNumOperands() >= (1 + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
+
+    MachineOperand &CstOp = MI.getOperand(1 + X86::AddrDisp);
+    if (auto *C = getConstantFromPool(MI, CstOp)) {
+      if (std::optional<APInt> Bits = getConstantBits(C)) {
+        if (OpBcst8 && Bits->isSplat(8)) {
+          Type *CstTy = Type::getInt8Ty(C->getType()->getContext());
+          Constant *NewCst = Constant::getIntegerValue(CstTy, Bits->trunc(8));
+          unsigned NewCPIdx = CP->getConstantPoolIndex(NewCst, Align(1));
+          MI.setDesc(TII->get(OpBcst8));
+          CstOp.setIndex(NewCPIdx);
+          return true;
+        }
+        if (OpBcst16 && Bits->isSplat(16)) {
+          Type *CstTy = Type::getInt16Ty(C->getType()->getContext());
+          Constant *NewCst = Constant::getIntegerValue(CstTy, Bits->trunc(16));
+          unsigned NewCPIdx = CP->getConstantPoolIndex(NewCst, Align(2));
+          MI.setDesc(TII->get(OpBcst16));
+          CstOp.setIndex(NewCPIdx);
+          return true;
+        }
+        if (OpBcst32 && Bits->isSplat(32)) {
+          Type *CstTy = Type::getInt32Ty(C->getType()->getContext());
+          Constant *NewCst = Constant::getIntegerValue(CstTy, Bits->trunc(32));
+          unsigned NewCPIdx = CP->getConstantPoolIndex(NewCst, Align(4));
+          MI.setDesc(TII->get(OpBcst32));
+          CstOp.setIndex(NewCPIdx);
+          return true;
+        }
+        if (OpBcst64 && Bits->isSplat(64)) {
+          Type *CstTy = Type::getInt64Ty(C->getType()->getContext());
+          Constant *NewCst = Constant::getIntegerValue(CstTy, Bits->trunc(64));
+          unsigned NewCPIdx = CP->getConstantPoolIndex(NewCst, Align(8));
+          MI.setDesc(TII->get(OpBcst64));
+          CstOp.setIndex(NewCPIdx);
+          return true;
+        }
+        return false;
+      }
+    }
+
+    return false;
+  };
+
+  auto ConvertToBroadcastFP = [&](unsigned OpBcst64, unsigned OpBcst32) {
+    assert(MI.getNumOperands() >= (1 + X86::AddrNumOperands) &&
+           "Unexpected number of operands!");
+
+    MachineOperand &CstOp = MI.getOperand(1 + X86::AddrDisp);
+    if (auto *C = getConstantFromPool(MI, CstOp)) {
+      if (std::optional<APInt> Bits = getConstantBits(C)) {
+        if (OpBcst32 && Bits->isSplat(32)) {
+          APFloat FBits(APFloat::IEEEsingle(), Bits->trunc(32));
+          Constant *NewCst = ConstantFP::get(C->getType()->getContext(), FBits);
+          unsigned NewCPIdx = CP->getConstantPoolIndex(NewCst, Align(4));
+          MI.setDesc(TII->get(OpBcst32));
+          CstOp.setIndex(NewCPIdx);
+          return true;
+        }
+        if (OpBcst64 && Bits->isSplat(64)) {
+          APFloat FBits(APFloat::IEEEdouble(), Bits->trunc(64));
+          Constant *NewCst = ConstantFP::get(C->getType()->getContext(), FBits);
+          unsigned NewCPIdx = CP->getConstantPoolIndex(NewCst, Align(8));
+          MI.setDesc(TII->get(OpBcst64));
+          CstOp.setIndex(NewCPIdx);
+          return true;
+        }
+        return false;
+      }
+    }
+
+    return false;
+  };
+
+  switch (Opc) {
+  case X86::VMOVDQArm:
+  case X86::VMOVDQUrm:
+    if (ST->hasInt256())
+      return ConvertToBroadcastInt(X86::VPBROADCASTQrm, X86::VPBROADCASTDrm,
+                                   X86::VPBROADCASTWrm, X86::VPBROADCASTBrm);
+    return false; // TODO: Add AVX VMOVDDUP/VBROADCASTSS.
+  case X86::VMOVDQAYrm:
+  case X86::VMOVDQUYrm:
+    if (ST->hasInt256())
+      return ConvertToBroadcastInt(X86::VPBROADCASTQYrm, X86::VPBROADCASTDYrm,
+                                   X86::VPBROADCASTWYrm, X86::VPBROADCASTBYrm);
+    return false; // TODO: Add AVX VBROADCASTSD/VBROADCASTSS.
+  case X86::MOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPDrm:
+  case X86::MOVUPSrm:
+    return false; // TODO: Add SSE3 MOVDDUP.
+  case X86::VMOVAPDrm:
+  case X86::VMOVAPSrm:
+  case X86::VMOVUPDrm:
+  case X86::VMOVUPSrm:
+    return ConvertToBroadcastFP(X86::VMOVDDUPrm, X86::VBROADCASTSSrm);
+  case X86::VMOVAPDYrm:
+  case X86::VMOVAPSYrm:
+  case X86::VMOVUPDYrm:
+  case X86::VMOVUPSYrm:
+    return ConvertToBroadcastFP(X86::VBROADCASTSDYrm, X86::VBROADCASTSSYrm);
+  }
+
+  return false;
+}
+
+bool X86FixupVectorConstantsPass::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << "Start X86FixupVectorConstants\n";);
+  bool Changed = false;
+  ST = &MF.getSubtarget<X86Subtarget>();
+  TII = ST->getInstrInfo();
+  SM = &ST->getSchedModel();
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+      if (processInstruction(MF, MBB, I)) {
+        ++NumInstChanges;
+        Changed = true;
+      }
+    }
+  }
+  LLVM_DEBUG(dbgs() << "End X86FixupVectorConstants\n";);
+  return Changed;
+}
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9732,7 +9732,7 @@
   // VBROADCAST requires AVX.
// TODO: Splats could be generated for non-AVX CPUs using SSE // instructions, but there's less potential gain for only 128-bit vectors. - if (!Subtarget.hasAVX()) + if (!Subtarget.hasAVX() || (!Subtarget.hasAVX512() && BVOp->isConstant())) return SDValue(); MVT VT = BVOp->getSimpleValueType(0); diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -574,6 +574,7 @@ addPass(createX86PadShortFunctions()); addPass(createX86FixupLEAs()); addPass(createX86FixupInstTuning()); + addPass(createX86FixupVectorConstants()); } addPass(createX86EvexToVexInsts()); addPass(createX86DiscriminateMemOpsPass()); diff --git a/llvm/test/CodeGen/X86/abdu-vector-128.ll b/llvm/test/CodeGen/X86/abdu-vector-128.ll --- a/llvm/test/CodeGen/X86/abdu-vector-128.ll +++ b/llvm/test/CodeGen/X86/abdu-vector-128.ll @@ -261,7 +261,7 @@ ; ; AVX2-LABEL: abd_ext_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -338,7 +338,7 @@ ; ; AVX2-LABEL: abd_ext_v2i64_undef: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -508,7 +508,7 @@ ; ; AVX2-LABEL: abd_minmax_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -675,7 +675,7 @@ ; ; AVX2-LABEL: abd_cmp_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -757,7 +757,7 @@ ; ; AVX2-LABEL: abd_cmp_v2i64_multiuse_cmp: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1221,7 +1221,7 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1346,7 
+1346,7 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1012,7 +1012,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1110,7 +1110,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1033,8 +1033,7 @@ ; ; AVX1-LABEL: avg_v32i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [506097522914230528,506097522914230528] -; AVX1-NEXT: # xmm0 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rax) @@ -1084,8 +1083,7 @@ ; ; AVX1-LABEL: avg_v64i8_const: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [506097522914230528,506097522914230528] -; AVX1-NEXT: # xmm0 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX1-NEXT: vpavgb (%rdi), %xmm0, %xmm1 ; AVX1-NEXT: vpavgb 16(%rdi), %xmm0, %xmm2 ; AVX1-NEXT: vpavgb 32(%rdi), %xmm0, %xmm3 @@ -1254,8 +1252,7 @@ ; ; AVX2-LABEL: avg_v32i16_const: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX2-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpavgw (%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vpavgw 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, (%rax) diff --git a/llvm/test/CodeGen/X86/avx-basic.ll b/llvm/test/CodeGen/X86/avx-basic.ll --- a/llvm/test/CodeGen/X86/avx-basic.ll +++ b/llvm/test/CodeGen/X86/avx-basic.ll @@ -87,7 +87,7 @@ define <16 x float> @fneg(<16 x float> %a) nounwind { ; CHECK-LABEL: fneg: ; CHECK: 
## %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx-logic.ll b/llvm/test/CodeGen/X86/avx-logic.ll --- a/llvm/test/CodeGen/X86/avx-logic.ll +++ b/llvm/test/CodeGen/X86/avx-logic.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=ANY,AVX1 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=ANY,INT256 -; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=ANY,INT256 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=ANY,AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=ANY,AVX,INT256 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f | FileCheck %s --check-prefixes=ANY,AVX512 define <4 x double> @andpd256(<4 x double> %y, <4 x double> %x) nounwind uwtable readnone ssp { ; ANY-LABEL: andpd256: @@ -273,32 +273,32 @@ } define <4 x i32> @and_xor_splat1_v4i32(<4 x i32> %x) nounwind { -; AVX1-LABEL: and_xor_splat1_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: retq +; AVX-LABEL: and_xor_splat1_v4i32: +; AVX: # %bb.0: +; AVX-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq ; -; INT256-LABEL: and_xor_splat1_v4i32: -; INT256: # %bb.0: -; INT256-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; INT256-NEXT: vandnps %xmm1, %xmm0, %xmm0 -; INT256-NEXT: retq +; AVX512-LABEL: and_xor_splat1_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] +; AVX512-NEXT: vandnps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %xor = xor <4 x i32> %x, %and = and <4 x i32> %xor, ret <4 x i32> %and } define <4 x i64> @and_xor_splat1_v4i64(<4 x i64> %x) nounwind { -; AVX1-LABEL: and_xor_splat1_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq +; AVX-LABEL: and_xor_splat1_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq ; -; INT256-LABEL: and_xor_splat1_v4i64: -; INT256: # %bb.0: -; INT256-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] -; INT256-NEXT: vandnps %ymm1, %ymm0, %ymm0 -; INT256-NEXT: retq +; AVX512-LABEL: and_xor_splat1_v4i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX512-NEXT: vandnps %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %xor = xor <4 x i64> %x, %and = and <4 x i64> %xor, ret <4 x i64> %and @@ -329,6 +329,13 @@ ; INT256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: and_disguised_i8_elts: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq %a = add <8 x i32> %x, %y %l = and <8 x i32> %a, %t = add <8 x i32> %l, %z @@ -357,6 +364,13 @@ ; INT256-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: andn_disguised_i8_elts: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; 
AVX512-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq %add = add <8 x i32> %y, %x %neg = and <8 x i32> %add, %and = xor <8 x i32> %neg, @@ -382,6 +396,12 @@ ; INT256-NEXT: vpandn %ymm2, %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: andn_variable_mask_operand_no_concat: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %and = and <8 x i32> %x, %z %xor = xor <8 x i32> %and, %z ; demanded bits will make this a 'not' %add = add <8 x i32> %xor, %y @@ -406,6 +426,12 @@ ; INT256-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: andn_constant_mask_operand_no_concat: +; AVX512: # %bb.0: +; AVX512-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %xor = xor <8 x i32> %x, %and = and <8 x i32> %xor, %r = add <8 x i32> %and, %y @@ -436,6 +462,13 @@ ; INT256-NEXT: vpandn %ymm2, %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: andn_variable_mask_operand_concat: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: retq %add = add <8 x i32> %x, %y %xor = xor <8 x i32> %add, %and = and <8 x i32> %xor, %z @@ -462,10 +495,17 @@ ; INT256-LABEL: or_disguised_i8_elts: ; INT256: # %bb.0: ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; INT256-NEXT: vpor %ymm1, %ymm0, %ymm0 +; INT256-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: or_disguised_i8_elts: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq %a = add <8 x i32> %x, %y %l = or <8 x i32> %a, %t = add <8 x i32> %l, %z @@ -491,10 +531,17 @@ ; INT256-LABEL: xor_disguised_i8_elts: ; INT256: # %bb.0: ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; INT256-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; INT256-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: xor_disguised_i8_elts: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq %a = add <8 x i32> %x, %y %l = xor <8 x i32> %a, %t = add <8 x i32> %l, %z @@ -524,6 +571,14 @@ ; INT256-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: and_disguised_i16_elts: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq %a = add <8 x i32> %x, %y %l = and <8 x i32> %a, %t = add <8 x i32> %l, %z @@ -549,10 +604,17 @@ ; INT256-LABEL: or_disguised_i16_elts: ; INT256: # %bb.0: ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; INT256-NEXT: vpor %ymm1, %ymm0, %ymm0 +; INT256-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: or_disguised_i16_elts: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq %a = add <8 x i32> %x, %y %l = or <8 x i32> %a, %t = add <8 x i32> %l, %z @@ -578,10 +640,17 @@ ; INT256-LABEL: xor_disguised_i16_elts: ; INT256: # %bb.0: ; INT256-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; INT256-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; INT256-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; INT256-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; INT256-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; INT256-NEXT: retq +; +; AVX512-LABEL: xor_disguised_i16_elts: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: retq %a = add <8 x i32> %x, %y %l = xor <8 x i32> %a, %t = add <8 x i32> %l, %z diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -300,12 +300,12 @@ define <4 x float> @_e2(ptr %ptr) nounwind uwtable readnone ssp { ; X86-LABEL: _e2: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] +; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] ; X86-NEXT: retl ; ; X64-LABEL: _e2: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] +; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] ; X64-NEXT: retq entry: %vecinit.i = insertelement <4 x float> undef, float 0xbf80000000000000, i32 0 diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll --- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll +++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll @@ -526,18 +526,11 @@ ;;; Memory folding cases define <4 x double> @ld0_hi0_lo1_4f64(ptr %pa, <4 x double> %b) nounwind uwtable readnone ssp { -; AVX1-LABEL: ld0_hi0_lo1_4f64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1] -; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: ld0_hi0_lo1_4f64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; ALL-LABEL: ld0_hi0_lo1_4f64: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vperm2f128 $3, 
(%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1] +; ALL-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; ALL-NEXT: retq entry: %a = load <4 x double>, ptr %pa %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -546,18 +539,11 @@ } define <4 x double> @ld1_hi0_hi1_4f64(<4 x double> %a, ptr %pb) nounwind uwtable readnone ssp { -; AVX1-LABEL: ld1_hi0_hi1_4f64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3] -; AVX1-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: ld1_hi0_hi1_4f64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; ALL-LABEL: ld1_hi0_hi1_4f64: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3] +; ALL-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; ALL-NEXT: retq entry: %b = load <4 x double>, ptr %pb %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> @@ -566,18 +552,11 @@ } define <8 x float> @ld0_hi0_lo1_8f32(ptr %pa, <8 x float> %b) nounwind uwtable readnone ssp { -; AVX1-LABEL: ld0_hi0_lo1_8f32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1] -; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: ld0_hi0_lo1_8f32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; ALL-LABEL: ld0_hi0_lo1_8f32: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vperm2f128 $3, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3],ymm0[0,1] +; ALL-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; ALL-NEXT: retq entry: %a = load <8 x float>, ptr %pa %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -586,18 +565,11 @@ } define <8 x float> @ld1_hi0_hi1_8f32(<8 x float> %a, ptr %pb) nounwind uwtable readnone ssp { -; AVX1-LABEL: ld1_hi0_hi1_8f32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3] -; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: ld1_hi0_hi1_8f32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; ALL-LABEL: ld1_hi0_hi1_8f32: +; ALL: # %bb.0: # %entry +; ALL-NEXT: vperm2f128 $49, (%rdi), %ymm0, %ymm0 # ymm0 = ymm0[2,3],mem[2,3] +; ALL-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; ALL-NEXT: retq entry: %b = load <8 x float>, ptr %pb %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -174,7 +174,7 @@ ; X32-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X32-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X32-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; X32-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X32-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; X32-NEXT: vpand %ymm3, %ymm2, %ymm2 ; X32-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; X32-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -188,7 +188,7 @@ ; X64-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X64-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X64-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; X64-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; X64-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; X64-NEXT: vpand %ymm3, %ymm2, %ymm2 ; X64-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; X64-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -364,14 +364,12 @@ define <4 x i32> @mul_const10(<4 x i32> %x) { ; X32-LABEL: mul_const10: ; X32: # %bb.0: -; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] -; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const10: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16843009,16843009,16843009,16843009] -; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, ret <4 x i32> %m @@ -381,14 +379,12 @@ define <4 x i32> @mul_const11(<4 x i32> %x) { ; X32-LABEL: mul_const11: ; X32: # %bb.0: -; X32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] -; X32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: mul_const11: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2155905152,2155905152,2155905152,2155905152] -; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq %m = mul <4 x i32> %x, ret <4 x i32> %m diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll --- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll +++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll @@ -43,15 +43,13 @@ ; X32-LABEL: test3: ; X32: # %bb.0: ; X32-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 -; X32-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X32-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X32-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X32-NEXT: retl ; ; X64-LABEL: test3: ; X64: # %bb.0: ; X64-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2 -; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X64-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq %a0 = extractelement <4 x float> %a, i64 0 %b0 = 
extractelement <4 x float> %b, i64 0 diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -1489,9 +1489,9 @@ define <2 x i64> @test_x86_avx2_psrlv_q_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,0,4,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,4] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] @@ -1507,9 +1507,9 @@ ; ; X64-AVX-LABEL: test_x86_avx2_psrlv_q_const: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [4,4] -; X64-AVX-NEXT: # encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A] -; X64-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,4] +; X64-AVX-NEXT: # encoding: [0xc4,0xe2,0x79,0x59,0x05,A,A,A,A] +; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0x45,0x05,A,A,A,A] ; X64-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX-NEXT: retq # encoding: [0xc3] @@ -1546,9 +1546,9 @@ define <4 x i64> @test_x86_avx2_psrlv_q_256_const() { ; X86-AVX-LABEL: test_x86_avx2_psrlv_q_256_const: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} ymm0 = [4,0,4,0,4,0,4,0] -; X86-AVX-NEXT: # encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A] -; X86-AVX-NEXT: # fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4,4,4,4] +; X86-AVX-NEXT: # encoding: [0xc4,0xe2,0x7d,0x59,0x05,A,A,A,A] +; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # encoding: [0xc4,0xe2,0xfd,0x45,0x05,A,A,A,A] ; X86-AVX-NEXT: # fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX-NEXT: retl # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/avx2-shift.ll b/llvm/test/CodeGen/X86/avx2-shift.ll --- a/llvm/test/CodeGen/X86/avx2-shift.ll +++ b/llvm/test/CodeGen/X86/avx2-shift.ll @@ -377,7 +377,7 @@ ; X86: # %bb.0: ; X86-NEXT: vpsrlw $3, %ymm0, %ymm0 ; X86-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X86-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; X86-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl @@ -386,7 +386,7 @@ ; X64: # %bb.0: ; X64-NEXT: vpsrlw $3, %ymm0, %ymm0 ; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X64-NEXT: 
vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; X64-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X64-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx2-vbroadcast.ll @@ -586,14 +586,12 @@ define <8 x i32> @V111(<8 x i32> %in) nounwind uwtable readnone ssp { ; X86-AVX2-LABEL: V111: ; X86-AVX2: ## %bb.0: ## %entry -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] -; X86-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: V111: ; X64-AVX2: ## %bb.0: ## %entry -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] -; X64-AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512VL-LABEL: V111: @@ -613,14 +611,12 @@ define <8 x float> @V113(<8 x float> %in) nounwind uwtable readnone ssp { ; X86-AVX2-LABEL: V113: ; X86-AVX2: ## %bb.0: ## %entry -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] -; X86-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: V113: ; X64-AVX2: ## %bb.0: ## %entry -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3,-7.8125E-3] -; X64-AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X86-AVX512VL-LABEL: V113: @@ -655,15 +651,25 @@ } define <8 x i8> @_e4(ptr %ptr) nounwind uwtable readnone ssp { -; X86-LABEL: _e4: -; X86: ## %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> -; X86-NEXT: retl +; X86-AVX2-LABEL: _e4: +; X86-AVX2: ## %bb.0: +; X86-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.67827977E-7,1.67827977E-7,1.67827977E-7,1.67827977E-7] +; X86-AVX2-NEXT: retl ; -; X64-LABEL: _e4: -; X64: ## %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> -; X64-NEXT: retq +; X64-AVX2-LABEL: _e4: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.67827977E-7,1.67827977E-7,1.67827977E-7,1.67827977E-7] +; X64-AVX2-NEXT: retq +; +; X86-AVX512VL-LABEL: _e4: +; X86-AVX512VL: ## %bb.0: +; X86-AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> +; X86-AVX512VL-NEXT: retl +; +; X64-AVX512VL-LABEL: _e4: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = <52,52,52,52,52,52,52,52,u,u,u,u,u,u,u,u> +; X64-AVX512VL-NEXT: retq %vecinit0.i = insertelement <8 x i8> undef, i8 52, i32 0 %vecinit1.i = insertelement <8 x i8> %vecinit0.i, i8 52, i32 1 %vecinit2.i = insertelement <8 x i8> %vecinit1.i, i8 52, i32 2 diff --git a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll --- a/llvm/test/CodeGen/X86/avx2-vector-shifts.ll +++ b/llvm/test/CodeGen/X86/avx2-vector-shifts.ll @@ -268,8 +268,7 @@ ; X86-SLOW: # %bb.0: ; X86-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; X86-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] -; X86-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X86-SLOW-NEXT: vandps 
{{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 ; X86-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X86-SLOW-NEXT: vzeroupper ; X86-SLOW-NEXT: retl @@ -278,8 +277,7 @@ ; X86-FAST-ALL: # %bb.0: ; X86-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> ; X86-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; X86-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X86-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X86-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 ; X86-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X86-FAST-ALL-NEXT: vzeroupper ; X86-FAST-ALL-NEXT: retl @@ -288,8 +286,7 @@ ; X86-FAST-PERLANE: # %bb.0: ; X86-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X86-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; X86-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] -; X86-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X86-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 ; X86-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X86-FAST-PERLANE-NEXT: vzeroupper ; X86-FAST-PERLANE-NEXT: retl @@ -298,8 +295,7 @@ ; X64-SLOW: # %bb.0: ; X64-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; X64-SLOW-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] -; X64-SLOW-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X64-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-SLOW-NEXT: vzeroupper ; X64-SLOW-NEXT: retq @@ -308,8 +304,7 @@ ; X64-FAST-ALL: # %bb.0: ; X64-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,4,6,u,u,u,u> ; X64-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; X64-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X64-FAST-ALL-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; X64-FAST-ALL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-FAST-ALL-NEXT: vzeroupper ; X64-FAST-ALL-NEXT: retq @@ -318,8 +313,7 @@ ; X64-FAST-PERLANE: # %bb.0: ; X64-FAST-PERLANE-NEXT: vextractf128 $1, %ymm1, %xmm2 ; X64-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; X64-FAST-PERLANE-NEXT: vbroadcastss {{.*#+}} xmm2 = [8,8,8,8] -; X64-FAST-PERLANE-NEXT: vandps %xmm2, %xmm1, %xmm1 +; X64-FAST-PERLANE-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; X64-FAST-PERLANE-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; X64-FAST-PERLANE-NEXT: vzeroupper ; X64-FAST-PERLANE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll --- a/llvm/test/CodeGen/X86/avx512-arith.ll +++ b/llvm/test/CodeGen/X86/avx512-arith.ll @@ -293,7 +293,7 @@ define <2 x i64> @imulq128_bcast(<2 x i64> %x) { ; AVX512F-LABEL: imulq128_bcast: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -313,7 +313,7 @@ ; ; AVX512BW-LABEL: imulq128_bcast: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -324,7 +324,7 @@ ; AVX512DQ-LABEL: imulq128_bcast: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8086,8086] ; AVX512DQ-NEXT: vpmullq %zmm1, 
%zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll --- a/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-Mask.ll @@ -98,7 +98,8 @@ ; X32: # %bb.0: # %entry ; X32-NEXT: pushl %edi ; X32-NEXT: subl $88, %esp -; X32-NEXT: vmovaps {{.*#+}} xmm0 = [2,1,2,1] +; X32-NEXT: vmovddup {{.*#+}} xmm0 = [2.1219957919534036E-314,2.1219957919534036E-314] +; X32-NEXT: # xmm0 = mem[0,0] ; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: vmovaps {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1] ; X32-NEXT: vmovups %zmm0, (%esp) diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -157,8 +157,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -355,8 +354,7 @@ ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -618,21 +616,22 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i64_64i8: diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll 
b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -199,8 +199,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 @@ -454,8 +453,7 @@ ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] @@ -807,30 +805,31 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307,-1.7939930131212661E-307] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlw $7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $7, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: ext_i64_64i8: @@ -842,7 +841,7 @@ ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -204,8 +204,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 @@ -249,8 +248,7 @@ ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] diff --git a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-vector-bool.ll @@ -121,18 +121,11 @@ ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_v4i32_cmp: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1-NEXT: setb %al -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v4i32_cmp: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vptest %xmm1, %xmm0 -; AVX2-NEXT: setb %al -; AVX2-NEXT: retq +; AVX12-LABEL: trunc_v4i32_cmp: +; AVX12: # %bb.0: +; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX12-NEXT: setb %al +; AVX12-NEXT: retq ; ; AVX512-LABEL: trunc_v4i32_cmp: ; AVX512: # %bb.0: @@ -335,20 +328,12 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_v4i64_cmp: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setne %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v4i64_cmp: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setne %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX12-LABEL: trunc_v4i64_cmp: +; AVX12: # %bb.0: +; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX12-NEXT: setne %al +; AVX12-NEXT: vzeroupper +; AVX12-NEXT: retq ; ; AVX512-LABEL: trunc_v4i64_cmp: ; AVX512: # %bb.0: @@ -411,20 +396,12 @@ ; SSE41-NEXT: setae %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_v8i132_cmp: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setae %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i132_cmp: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setae %al -; AVX2-NEXT: vzeroupper -; 
AVX2-NEXT: retq +; AVX12-LABEL: trunc_v8i132_cmp: +; AVX12: # %bb.0: +; AVX12-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX12-NEXT: setae %al +; AVX12-NEXT: vzeroupper +; AVX12-NEXT: retq ; ; AVX512-LABEL: trunc_v8i132_cmp: ; AVX512: # %bb.0: @@ -719,8 +696,7 @@ ; AVX2-LABEL: trunc_v8i64_cmp: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -848,8 +824,7 @@ ; AVX2-LABEL: trunc_v16i32_cmp: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/bool-ext-inc.ll b/llvm/test/CodeGen/X86/bool-ext-inc.ll --- a/llvm/test/CodeGen/X86/bool-ext-inc.ll +++ b/llvm/test/CodeGen/X86/bool-ext-inc.ll @@ -19,8 +19,7 @@ define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind { ; CHECK-LABEL: sext_inc_vec: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vandnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %ext = sext <4 x i1> %x to <4 x i32> %add = add <4 x i32> %ext, @@ -31,8 +30,7 @@ ; CHECK-LABEL: cmpgt_sext_inc_vec: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i32> %x, %y %ext = sext <4 x i1> %cmp to <4 x i32> @@ -56,8 +54,7 @@ ; CHECK-LABEL: cmpgt_sext_inc_vec256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; CHECK-NEXT: vpandn %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq %cmp = icmp sgt <4 x i64> %x, %y %ext = sext <4 x i1> %cmp to <4 x i64> @@ -91,8 +88,7 @@ ; CHECK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %cmp1 = icmp ne <4 x i32> %a, %b %cmp2 = icmp ne <4 x i32> %c, %d diff --git a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll --- a/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll +++ b/llvm/test/CodeGen/X86/broadcast-elm-cross-splat-vec.ll @@ -53,7 +53,7 @@ define <16 x i8> @f16xi8_i32(<16 x i8> %a) { ; AVX-LABEL: f16xi8_i32: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl @@ -67,7 +67,7 @@ ; ; AVX-64-LABEL: f16xi8_i32: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [50462976,50462976,50462976,50462976] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -87,8 +87,7 @@ define <16 x i8> @f16xi8_i64(<16 x i8> 
%a) { ; AVX-LABEL: f16xi8_i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [506097522914230528,506097522914230528] -; AVX-NEXT: # xmm1 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl @@ -102,8 +101,7 @@ ; ; AVX-64-LABEL: f16xi8_i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [506097522914230528,506097522914230528] -; AVX-64-NEXT: # xmm1 = mem[0,0] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -164,7 +162,7 @@ ; AVX-LABEL: f32xi8_i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [50462976,50462976,50462976,50462976] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -181,7 +179,7 @@ ; AVX-64-LABEL: f32xi8_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [50462976,50462976,50462976,50462976] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -204,8 +202,7 @@ ; AVX-LABEL: f32xi8_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [506097522914230528,506097522914230528] -; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -222,8 +219,7 @@ ; AVX-64-LABEL: f32xi8_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [506097522914230528,506097522914230528] -; AVX-64-NEXT: # xmm2 = mem[0,0] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -253,13 +249,20 @@ ; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; -; ALL32-LABEL: f32xi8_i128: -; ALL32: # %bb.0: -; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; ALL32-NEXT: # ymm1 = mem[0,1,0,1] -; ALL32-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: retl +; AVX2-LABEL: f32xi8_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f32xi8_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retl ; ; AVX-64-LABEL: f32xi8_i128: ; AVX-64: # %bb.0: @@ -271,13 +274,20 @@ ; AVX-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; -; ALL64-LABEL: f32xi8_i128: -; ALL64: # %bb.0: -; ALL64-NEXT: vbroadcasti128 {{.*#+}} 
ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; ALL64-NEXT: # ymm1 = mem[0,1,0,1] -; ALL64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: retq +; AVX2-64-LABEL: f32xi8_i128: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f32xi8_i128: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512F-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: retq %res1 = add <32 x i8> , %a %res2 = and <32 x i8> , %res1 ret <32 x i8> %res2 @@ -296,7 +306,7 @@ ; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -328,7 +338,7 @@ ; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38,2.35106045E-38] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -357,15 +367,16 @@ define <64 x i8> @f64i8_i32(<64 x i8> %a) { ; AVX-LABEL: f64i8_i32: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -388,15 +399,16 @@ ; ; AVX-64-LABEL: f64i8_i32: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [50462976,50462976,50462976,50462976,50462976,50462976,50462976,50462976] -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-64-NEXT: 
vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37,3.82047143E-37] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -425,15 +437,16 @@ define <64 x i8> @f64xi8_i64(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i64: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -456,15 +469,16 @@ ; ; AVX-64-LABEL: f64xi8_i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [506097522914230528,506097522914230528,506097522914230528,506097522914230528] -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275,7.9499288951273625E-275] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -493,24 +507,23 @@ define <64 x i8> @f64xi8_i128(<64 x i8> %a) { ; AVX-LABEL: f64xi8_i128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} 
ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f64xi8_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -527,24 +540,23 @@ ; ; AVX-64-LABEL: f64xi8_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f64xi8_i128: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -641,7 +653,7 @@ define <8 x i16> @f8xi16_i32(<8 x i16> %a) { ; AVX-LABEL: f8xi16_i32: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [65536,65536,65536,65536] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: 
vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl @@ -655,7 +667,7 @@ ; ; AVX-64-LABEL: f8xi16_i32: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [65536,65536,65536,65536] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -675,8 +687,7 @@ define <8 x i16> @f8xi16_i64(<8 x i16> %a) { ; AVX-LABEL: f8xi16_i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [844433520132096,844433520132096] -; AVX-NEXT: # xmm1 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl @@ -690,8 +701,7 @@ ; ; AVX-64-LABEL: f8xi16_i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [844433520132096,844433520132096] -; AVX-64-NEXT: # xmm1 = mem[0,0] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] ; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -712,7 +722,7 @@ ; AVX-LABEL: f16xi16_i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [65536,65536,65536,65536] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1] ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -729,7 +739,7 @@ ; AVX-64-LABEL: f16xi16_i32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm2 = [65536,65536,65536,65536] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1] ; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -752,8 +762,7 @@ ; AVX-LABEL: f16xi16_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [844433520132096,844433520132096] -; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3] ; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -770,8 +779,7 @@ ; AVX-64-LABEL: f16xi16_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [844433520132096,844433520132096] -; AVX-64-NEXT: # xmm2 = mem[0,0] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3] ; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -801,13 +809,20 @@ ; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; -; ALL32-LABEL: f16xi16_i128: -; ALL32: # %bb.0: -; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; ALL32-NEXT: # ymm1 = mem[0,1,0,1] -; ALL32-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: retl +; AVX2-LABEL: f16xi16_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f16xi16_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retl ; ; AVX-64-LABEL: f16xi16_i128: ; AVX-64: # %bb.0: @@ -819,13 +834,20 @@ ; AVX-64-NEXT: vandps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; -; ALL64-LABEL: f16xi16_i128: -; ALL64: # %bb.0: -; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; ALL64-NEXT: # ymm1 = mem[0,1,0,1] -; ALL64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: retq +; AVX2-64-LABEL: f16xi16_i128: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX2-64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f16xi16_i128: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] +; AVX512F-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-64-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: retq %res1 = add <16 x i16> , %a %res2 = and <16 x i16> , %res1 ret <16 x i16> %res2 @@ -835,15 +857,16 @@ define <32 x i16> @f32xi16_i32(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i32: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1] +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -866,15 +889,16 @@ ; ; AVX-64-LABEL: f32xi16_i32: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [65536,65536,65536,65536,65536,65536,65536,65536] -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1] +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vbroadcastss {{.*#+}} ymm2 = [9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41,9.18354962E-41] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -903,15 +927,16 @@ define <32 x i16> @f32xi16_i64(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i64: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = 
[844433520132096,844433520132096,844433520132096,844433520132096] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3] +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -934,15 +959,16 @@ ; ; AVX-64-LABEL: f32xi16_i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [844433520132096,844433520132096,844433520132096,844433520132096] -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,2,3] +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309,4.1720559249406128E-309] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -971,24 +997,23 @@ define <32 x i16> @f32xi16_i128(<32 x i16> %a) { ; AVX-LABEL: f32xi16_i128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f32xi16_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX2-NEXT: # ymm2 = 
mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX2-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1005,24 +1030,23 @@ ; ; AVX-64-LABEL: f32xi16_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7] +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f32xi16_i128: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] -; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7] ; AVX2-64-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1119,8 +1143,7 @@ define <4 x i32> @f4xi32_i64(<4 x i32> %a) { ; AVX-LABEL: f4xi32_i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4294967296,4294967296] -; AVX-NEXT: # xmm1 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1] ; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl @@ -1134,8 +1157,7 @@ ; ; AVX-64-LABEL: f4xi32_i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [4294967296,4294967296] -; AVX-64-NEXT: # xmm1 = mem[0,0] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1] ; AVX-64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq @@ -1156,8 +1178,7 @@ ; AVX-LABEL: f8xi32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [4294967296,4294967296] -; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1] ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1174,8 +1195,7 @@ ; AVX-64-LABEL: f8xi32_i64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-64-NEXT: vmovddup {{.*#+}} xmm2 = [4294967296,4294967296] -; AVX-64-NEXT: # xmm2 = mem[0,0] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1] ; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -1205,13 +1225,20 @@ ; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; -; ALL32-LABEL: f8xi32_i128: -; ALL32: # %bb.0: -; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] -; ALL32-NEXT: # ymm1 = mem[0,1,0,1] -; ALL32-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: retl +; AVX2-LABEL: 
f8xi32_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f8xi32_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retl ; ; AVX-64-LABEL: f8xi32_i128: ; AVX-64: # %bb.0: @@ -1223,13 +1250,20 @@ ; AVX-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; -; ALL64-LABEL: f8xi32_i128: -; ALL64: # %bb.0: -; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] -; ALL64-NEXT: # ymm1 = mem[0,1,0,1] -; ALL64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: retq +; AVX2-64-LABEL: f8xi32_i128: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] +; AVX2-64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xi32_i128: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,2,3,0,1,2,3] +; AVX512F-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: retq %res1 = add <8 x i32> , %a %res2 = and <8 x i32> , %res1 ret <8 x i32> %res2 @@ -1239,15 +1273,16 @@ define <16 x i32> @f16xi32_i64(<16 x i32> %a) { ; AVX-LABEL: f16xi32_i64: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1] +; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl @@ -1270,15 +1305,16 @@ ; ; AVX-64-LABEL: f16xi32_i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967296,4294967296,4294967296,4294967296] -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1] +; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, 
%xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314,2.1219957909652723E-314] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq @@ -1307,24 +1343,23 @@ define <16 x i32> @f16xi32_i128(<16 x i32> %a) { ; AVX-LABEL: f16xi32_i128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddd %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f16xi32_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1341,24 +1376,23 @@ ; ; AVX-64-LABEL: f16xi32_i128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] -; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-64-NEXT: vpaddd %xmm2, %xmm3, %xmm3 -; AVX-64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-64-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] +; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX-64-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f16xi32_i128: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] -; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,2,3] ; AVX2-64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1389,13 +1423,20 @@ ; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; AVX-NEXT: retl ; -; ALL32-LABEL: f4xi64_i128: -; ALL32: # %bb.0: -; ALL32-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; ALL32-NEXT: # ymm1 = mem[0,1,0,1] -; ALL32-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: retl +; AVX2-LABEL: f4xi64_i128: +; AVX2: # 
%bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f4xi64_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retl ; ; AVX-64-LABEL: f4xi64_i128: ; AVX-64: # %bb.0: @@ -1407,13 +1448,20 @@ ; AVX-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-64-NEXT: retq ; -; ALL64-LABEL: f4xi64_i128: -; ALL64: # %bb.0: -; ALL64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1] -; ALL64-NEXT: # ymm1 = mem[0,1,0,1] -; ALL64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: vpand %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: retq +; AVX2-64-LABEL: f4xi64_i128: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,0,1] +; AVX2-64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f4xi64_i128: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,1,0,1] +; AVX512F-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: retq %res1 = add <4 x i64> , %a %res2 = and <4 x i64> , %res1 ret <4 x i64> %res2 @@ -1423,24 +1471,23 @@ define <8 x i64> @f8xi64_i128(<8 x i64> %a) { ; AVX-LABEL: f8xi64_i128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,0,1,0] +; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] ; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: retl ; ; AVX2-LABEL: f8xi64_i128: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1466,16 +1513,14 @@ ; AVX-64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ; AVX-64-NEXT: vpaddq %xmm3, %xmm0, %xmm0 ; AVX-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] -; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [0,1,0,1] ; AVX-64-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: retq ; ; AVX2-64-LABEL: f8xi64_i128: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,1,0,1] -; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1] ; AVX2-64-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vpaddq %ymm2, 
%ymm0, %ymm0 ; AVX2-64-NEXT: vpand %ymm2, %ymm0, %ymm0 @@ -1572,35 +1617,51 @@ define <4 x float> @f4xf32_f64(<4 x float> %a) { ; AVX-LABEL: f4xf32_f64: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3] ; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retl ; -; ALL32-LABEL: f4xf32_f64: -; ALL32: # %bb.0: -; ALL32-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] -; ALL32-NEXT: # xmm1 = mem[0,0] -; ALL32-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; ALL32-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; ALL32-NEXT: retl +; AVX2-LABEL: f4xf32_f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f4xf32_f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] +; AVX512-NEXT: # xmm1 = mem[0,0] +; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: retl ; ; AVX-64-LABEL: f4xf32_f64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] +; AVX-64-NEXT: vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3] ; AVX-64-NEXT: # xmm1 = mem[0,0] ; AVX-64-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vdivps %xmm0, %xmm1, %xmm0 ; AVX-64-NEXT: retq ; -; ALL64-LABEL: f4xf32_f64: -; ALL64: # %bb.0: -; ALL64-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] -; ALL64-NEXT: # xmm1 = mem[0,0] -; ALL64-NEXT: vaddps %xmm1, %xmm0, %xmm0 -; ALL64-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; ALL64-NEXT: retq +; AVX2-64-LABEL: f4xf32_f64: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vmovddup {{.*#+}} xmm1 = [7.8125018626451492E-3,7.8125018626451492E-3] +; AVX2-64-NEXT: # xmm1 = mem[0,0] +; AVX2-64-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX2-64-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f4xf32_f64: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vmovddup {{.*#+}} xmm1 = [4575657222482165760,4575657222482165760] +; AVX512F-64-NEXT: # xmm1 = mem[0,0] +; AVX512F-64-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; AVX512F-64-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; AVX512F-64-NEXT: retq %res1 = fadd <4 x float> , %a %res2 = fdiv <4 x float> , %res1 ret <4 x float> %res2 @@ -1610,31 +1671,45 @@ define <8 x float> @f8xf32_f64(<8 x float> %a) { ; AVX-LABEL: f8xf32_f64: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retl ; -; ALL32-LABEL: f8xf32_f64: -; ALL32: # %bb.0: -; ALL32-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] -; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; ALL32-NEXT: retl +; AVX2-LABEL: f8xf32_f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3] +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vdivps %ymm0, 
%ymm1, %ymm0 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f8xf32_f64: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retl ; ; AVX-64-LABEL: f8xf32_f64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3] ; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; AVX-64-NEXT: retq ; -; ALL64-LABEL: f8xf32_f64: -; ALL64: # %bb.0: -; ALL64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] -; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; ALL64-NEXT: retq +; AVX2-64-LABEL: f8xf32_f64: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3] +; AVX2-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xf32_f64: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX512F-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX512F-64-NEXT: retq %res1 = fadd <8 x float> , %a %res2 = fdiv <8 x float> , %res1 ret <8 x float> %res2 @@ -1644,35 +1719,47 @@ define <8 x float> @f8xf32_f128(<8 x float> %a) { ; AVX-LABEL: f8xf32_f128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retl ; -; ALL32-LABEL: f8xf32_f128: -; ALL32: # %bb.0: -; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; ALL32-NEXT: # ymm1 = mem[0,1,0,1] -; ALL32-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; ALL32-NEXT: retl +; AVX2-LABEL: f8xf32_f128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] +; AVX2-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f8xf32_f128: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retl ; ; AVX-64-LABEL: f8xf32_f128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-64-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; AVX-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 ; AVX-64-NEXT: retq ; -; ALL64-LABEL: f8xf32_f128: -; ALL64: # %bb.0: -; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; ALL64-NEXT: # 
ymm1 = mem[0,1,0,1] -; ALL64-NEXT: vaddps %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; ALL64-NEXT: retq +; AVX2-64-LABEL: f8xf32_f128: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] +; AVX2-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f8xf32_f128: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] +; AVX512F-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-64-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX512F-64-NEXT: retq %res1 = fadd <8 x float> , %a %res2 = fdiv <8 x float> , %res1 ret <8 x float> %res2 @@ -1682,7 +1769,7 @@ define <16 x float> @f16xf32_f64(<16 x float> %a) { ; AVX-LABEL: f16xf32_f64: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3] ; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1691,7 +1778,7 @@ ; ; AVX2-LABEL: f16xf32_f64: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3] ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1707,7 +1794,7 @@ ; ; AVX-64-LABEL: f16xf32_f64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3] ; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1716,7 +1803,7 @@ ; ; AVX2-64-LABEL: f16xf32_f64: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4575657222482165760,4575657222482165760,4575657222482165760,4575657222482165760] +; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3,7.8125018626451492E-3] ; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1738,8 +1825,7 @@ define <16 x float> @f16xf32_f128(<16 x float> %a) { ; AVX-LABEL: f16xf32_f128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1748,8 +1834,7 @@ ; ; AVX2-LABEL: f16xf32_f128: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX2-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivps %ymm0, %ymm2, %ymm0 
@@ -1766,8 +1851,7 @@ ; ; AVX-64-LABEL: f16xf32_f128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vmovaps {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1776,8 +1860,7 @@ ; ; AVX2-64-LABEL: f16xf32_f128: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] -; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovaps {{.*#+}} ymm2 = [4.0E+0,1.0E+0,2.0E+0,3.0E+0,4.0E+0,1.0E+0,2.0E+0,3.0E+0] ; AVX2-64-NEXT: vaddps %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vdivps %ymm0, %ymm2, %ymm0 @@ -1858,35 +1941,47 @@ define <4 x double> @f4xf64_f128(<4 x double> %a) { ; AVX-LABEL: f4xf64_f128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: vdivpd %ymm0, %ymm1, %ymm0 ; AVX-NEXT: retl ; -; ALL32-LABEL: f4xf64_f128: -; ALL32: # %bb.0: -; ALL32-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; ALL32-NEXT: # ymm1 = mem[0,1,0,1] -; ALL32-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; ALL32-NEXT: vdivpd %ymm0, %ymm1, %ymm0 -; ALL32-NEXT: retl +; AVX2-LABEL: f4xf64_f128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovapd {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] +; AVX2-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retl +; +; AVX512-LABEL: f4xf64_f128: +; AVX512: # %bb.0: +; AVX512-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] +; AVX512-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: retl ; ; AVX-64-LABEL: f4xf64_f128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-64-NEXT: vmovapd {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; AVX-64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 ; AVX-64-NEXT: retq ; -; ALL64-LABEL: f4xf64_f128: -; ALL64: # %bb.0: -; ALL64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; ALL64-NEXT: # ymm1 = mem[0,1,0,1] -; ALL64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; ALL64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 -; ALL64-NEXT: retq +; AVX2-64-LABEL: f4xf64_f128: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vmovapd {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] +; AVX2-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX2-64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512F-64-LABEL: f4xf64_f128: +; AVX512F-64: # %bb.0: +; AVX512F-64-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] +; AVX512F-64-NEXT: # ymm1 = mem[0,1,0,1] +; AVX512F-64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; AVX512F-64-NEXT: vdivpd %ymm0, %ymm1, %ymm0 +; AVX512F-64-NEXT: retq %res1 = fadd <4 x double> , %a %res2 = fdiv <4 x double> , %res1 ret <4 x double> %res2 @@ -1896,8 +1991,7 @@ define <8 x double> @f8xf64_f128(<8 x double> %a) { ; AVX-LABEL: f8xf64_f128: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vmovapd {{.*#+}} ymm2 = 
[2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX-NEXT: vdivpd %ymm0, %ymm2, %ymm0 @@ -1906,8 +2000,7 @@ ; ; AVX2-LABEL: f8xf64_f128: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovapd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX2-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vdivpd %ymm0, %ymm2, %ymm0 @@ -1924,8 +2017,7 @@ ; ; AVX-64-LABEL: f8xf64_f128: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-64-NEXT: vmovapd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 @@ -1934,8 +2026,7 @@ ; ; AVX2-64-LABEL: f8xf64_f128: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] -; AVX2-64-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-64-NEXT: vmovapd {{.*#+}} ymm2 = [2.0E+0,1.0E+0,2.0E+0,1.0E+0] ; AVX2-64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: vdivpd %ymm0, %ymm2, %ymm0 @@ -2023,7 +2114,7 @@ define <8 x i16> @f8xi16_i32_NaN(<8 x i16> %a) { ; AVX-LABEL: f8xi16_i32_NaN: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] +; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,65466,0,65466,0,65466,0,65466] ; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retl @@ -2037,7 +2128,7 @@ ; ; AVX-64-LABEL: f8xi16_i32_NaN: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vbroadcastss {{.*#+}} xmm1 = [4290379776,4290379776,4290379776,4290379776] +; AVX-64-NEXT: vmovdqa {{.*#+}} xmm1 = [0,65466,0,65466,0,65466,0,65466] ; AVX-64-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll --- a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -194,7 +194,7 @@ ; AVX1-LABEL: trunc: ; AVX1: # %bb.0: ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41] ; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 @@ -337,7 +337,7 @@ ; AVX1-LABEL: example25: ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB5_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/combine-add.ll b/llvm/test/CodeGen/X86/combine-add.ll --- a/llvm/test/CodeGen/X86/combine-add.ll +++ b/llvm/test/CodeGen/X86/combine-add.ll @@ -262,8 +262,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [10,10,10,10,10,10,10,10] ; AVX2-NEXT: vpsubd (%rdi), %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] -; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpmulld 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-NEXT: vmovdqu %ymm0, (%rsi) ; AVX2-NEXT: vmovdqu %ymm1, (%rdi) ; AVX2-NEXT: vzeroupper @@ -285,21 +284,12 @@ ; SSE-NEXT: orps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_vec_add_uniquebits: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_vec_add_uniquebits: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [61680,61680,61680,61680] -; AVX2-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [3855,3855,3855,3855] -; AVX2-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_vec_add_uniquebits: +; AVX: # %bb.0: +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = and <4 x i32> %a, %2 = and <4 x i32> %b, %3 = add <4 x i32> %1, %2 diff --git a/llvm/test/CodeGen/X86/combine-addo.ll b/llvm/test/CodeGen/X86/combine-addo.ll --- a/llvm/test/CodeGen/X86/combine-addo.ll +++ b/llvm/test/CodeGen/X86/combine-addo.ll @@ -113,8 +113,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX-NEXT: vpmaxud %xmm2, %xmm0, %xmm2 +; AVX-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -327,12 +327,14 @@ ; ; AVX1-LABEL: and_or_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [8,8] +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = [3.9525251667299724E-323,3.9525251667299724E-323] +; AVX1-NEXT: # xmm0 = mem[0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: and_or_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [8,8] +; AVX2-NEXT: vmovddup {{.*#+}} xmm0 = [3.9525251667299724E-323,3.9525251667299724E-323] +; AVX2-NEXT: # xmm0 = mem[0,0] ; AVX2-NEXT: retq ; ; AVX512-LABEL: and_or_v2i64: @@ -353,12 +355,12 @@ ; ; AVX1-LABEL: and_or_v4i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [3,3,3,3] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [4.20389539E-45,4.20389539E-45,4.20389539E-45,4.20389539E-45] ; AVX1-NEXT: retq ; ; AVX2-LABEL: and_or_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [3,3,3,3] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [4.20389539E-45,4.20389539E-45,4.20389539E-45,4.20389539E-45] ; AVX2-NEXT: retq ; ; AVX512-LABEL: and_or_v4i32: diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll --- a/llvm/test/CodeGen/X86/combine-bitselect.ll +++ b/llvm/test/CodeGen/X86/combine-bitselect.ll @@ -706,16 +706,14 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa (%rdi), %ymm2 ; XOP-NEXT: vmovdqa 32(%rdi), %ymm3 -; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] -; XOP-NEXT: # ymm4 = mem[0,1,0,1] +; XOP-NEXT: vmovdqa {{.*#+}} ymm4 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] ; XOP-NEXT: vpcmov %ymm4, %ymm0, %ymm2, %ymm0 ; XOP-NEXT: vpcmov %ymm4, %ymm1, %ymm3, %ymm1 ; 
XOP-NEXT: retq ; ; AVX-LABEL: bitselect_v8i64_rm: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [18446744065119617022,18446744073709551612,18446744065119617022,18446744073709551612] ; AVX-NEXT: vandps 32(%rdi), %ymm2, %ymm3 ; AVX-NEXT: vandps (%rdi), %ymm2, %ymm4 ; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 @@ -769,16 +767,14 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa (%rdi), %ymm2 ; XOP-NEXT: vmovdqa 32(%rdi), %ymm3 -; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [12884901890,4294967296,12884901890,4294967296] -; XOP-NEXT: # ymm4 = mem[0,1,0,1] +; XOP-NEXT: vmovdqa {{.*#+}} ymm4 = [12884901890,4294967296,12884901890,4294967296] ; XOP-NEXT: vpcmov %ymm4, %ymm0, %ymm2, %ymm0 ; XOP-NEXT: vpcmov %ymm4, %ymm1, %ymm3, %ymm1 ; XOP-NEXT: retq ; ; AVX-LABEL: bitselect_v8i64_mr: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] -; AVX-NEXT: # ymm2 = mem[0,1,0,1] +; AVX-NEXT: vmovaps {{.*#+}} ymm2 = [12884901890,4294967296,12884901890,4294967296] ; AVX-NEXT: vandps 32(%rdi), %ymm2, %ymm3 ; AVX-NEXT: vandps (%rdi), %ymm2, %ymm4 ; AVX-NEXT: vandnps %ymm0, %ymm2, %ymm0 @@ -828,16 +824,14 @@ ; XOP: # %bb.0: ; XOP-NEXT: vmovdqa (%rsi), %ymm0 ; XOP-NEXT: vmovdqa 32(%rsi), %ymm1 -; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] -; XOP-NEXT: # ymm2 = mem[0,1,0,1] +; XOP-NEXT: vmovdqa {{.*#+}} ymm2 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] ; XOP-NEXT: vpcmov %ymm2, (%rdi), %ymm0, %ymm0 ; XOP-NEXT: vpcmov %ymm2, 32(%rdi), %ymm1, %ymm1 ; XOP-NEXT: retq ; ; AVX-LABEL: bitselect_v8i64_mm: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] -; AVX-NEXT: # ymm1 = mem[0,1,0,1] +; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [18446744073709551612,18446744065119617022,18446744073709551612,18446744065119617022] ; AVX-NEXT: vandps 32(%rsi), %ymm1, %ymm2 ; AVX-NEXT: vandps (%rsi), %ymm1, %ymm0 ; AVX-NEXT: vandnps (%rdi), %ymm1, %ymm3 @@ -1009,25 +1003,14 @@ ; XOP-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; XOP-NEXT: retq ; -; AVX1-LABEL: bitselect_v4i1_loop: -; AVX1: # %bb.0: # %bb -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 -; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: bitselect_v4i1_loop: -; AVX2: # %bb.0: # %bb -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [12,12,12,12] -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [15,15,15,15] -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: bitselect_v4i1_loop: +; AVX: # %bb.0: # %bb +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: bitselect_v4i1_loop: ; AVX512F: # %bb.0: # %bb diff --git 
a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -64,7 +64,8 @@ ; AVX2-NEXT: movq %rcx, 46348(%rax) ; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX2-NEXT: vmovups %ymm0, 48296(%rax) -; AVX2-NEXT: vmovlps %xmm0, 47372(%rax) +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: vmovsd %xmm0, 47372(%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq alloca_0: diff --git a/llvm/test/CodeGen/X86/combine-fabs.ll b/llvm/test/CodeGen/X86/combine-fabs.ll --- a/llvm/test/CodeGen/X86/combine-fabs.ll +++ b/llvm/test/CodeGen/X86/combine-fabs.ll @@ -45,8 +45,7 @@ ; ; AVX-LABEL: combine_fabs_fabs: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call float @llvm.fabs.f32(float %a) %2 = call float @llvm.fabs.f32(float %1) @@ -61,8 +60,7 @@ ; ; AVX-LABEL: combine_vec_fabs_fabs: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a) %2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %1) @@ -78,8 +76,7 @@ ; ; AVX-LABEL: combine_fabs_fneg: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = fsub float -0.0, %a %2 = call float @llvm.fabs.f32(float %1) @@ -94,8 +91,7 @@ ; ; AVX-LABEL: combine_vec_fabs_fneg: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = fsub <4 x float> , %a %2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %1) @@ -111,8 +107,7 @@ ; ; AVX-LABEL: combine_fabs_fcopysign: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call float @llvm.copysign.f32(float %a, float %b) %2 = call float @llvm.fabs.f32(float %1) @@ -127,8 +122,7 @@ ; ; AVX-LABEL: combine_vec_fabs_fcopysign: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %a, <4 x float> %b) %2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %1) diff --git a/llvm/test/CodeGen/X86/combine-fcopysign.ll b/llvm/test/CodeGen/X86/combine-fcopysign.ll --- a/llvm/test/CodeGen/X86/combine-fcopysign.ll +++ b/llvm/test/CodeGen/X86/combine-fcopysign.ll @@ -16,8 +16,7 @@ ; ; AVX-LABEL: combine_vec_fcopysign_pos_constant0: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -31,8 +30,7 @@ ; ; AVX-LABEL: combine_vec_fcopysign_pos_constant1: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -46,8 +44,7 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fabs_sgn: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y) %2 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %1) @@ -63,8 +60,7 @@ ; ; AVX-LABEL: combine_vec_fcopysign_neg_constant0: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -78,8 +74,7 @@ ; ; AVX-LABEL: combine_vec_fcopysign_neg_constant1: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> ) ret <4 x float> %1 @@ -93,8 +88,7 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fneg_fabs_sgn: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y) %2 = fsub <4 x float> , %1 @@ -113,10 +107,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fabs_mag: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x) @@ -135,10 +127,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fneg_mag: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = fsub <4 x float> , %x @@ -157,10 +147,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fcopysign_mag: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %z) @@ -179,10 +167,8 @@ ; ; AVX-LABEL: combine_vec_fcopysign_fcopysign_sgn: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %xmm1, %xmm2, 
%xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = call <4 x float> @llvm.copysign.v4f32(<4 x float> %y, <4 x float> %z) @@ -210,10 +196,8 @@ ; AVX-LABEL: combine_vec_fcopysign_fpext_sgn: ; AVX: # %bb.0: ; AVX-NEXT: vcvtps2pd %xmm1, %ymm1 -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq %1 = fpext <4 x float> %y to <4 x double> @@ -236,10 +220,8 @@ ; AVX-LABEL: combine_vec_fcopysign_fptrunc_sgn: ; AVX: # %bb.0: ; AVX-NEXT: vcvtpd2ps %ymm1, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; AVX-NEXT: vandpd %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX-NEXT: vandpd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vorpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-mul.ll b/llvm/test/CodeGen/X86/combine-mul.ll --- a/llvm/test/CodeGen/X86/combine-mul.ll +++ b/llvm/test/CodeGen/X86/combine-mul.ll @@ -158,15 +158,14 @@ ; ; AVX-LABEL: combine_vec_mul_negpow2c: ; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm1 -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600] -; AVX-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 -; AVX-NEXT: vpaddq %ymm2, %ymm1, %ymm1 -; AVX-NEXT: vpsllq $32, %ymm1, %ymm1 -; AVX-NEXT: vpmuludq %ymm3, %ymm0, %ymm0 +; AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [18446744073709551615,18446744073709551614,18446744073709551612,18446744073709551600] +; AVX-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX-NEXT: vpmuludq %ymm1, %ymm3, %ymm1 +; AVX-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vpsllq $32, %ymm0, %ymm0 +; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ; AVX-NEXT: retq %1 = mul <4 x i64> %x, ret <4 x i64> %1 diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll --- a/llvm/test/CodeGen/X86/combine-pavg.ll +++ b/llvm/test/CodeGen/X86/combine-pavg.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone declare <8 x i16> 
@llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone @@ -31,17 +31,29 @@ ; SSE-NEXT: packuswb %xmm3, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_pavgw_knownbits: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] -; AVX-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm4, %xmm2, %xmm1 -; AVX-NEXT: vpand %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpavgw %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: combine_pavgw_knownbits: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm2 +; AVX1-NEXT: vpavgw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: combine_pavgw_knownbits: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = [31,31,31,31,31,31,31,31] +; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm4, %xmm2, %xmm1 +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm2 +; AVX2-NEXT: vpavgw %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %m0 = and <8 x i16> %a0, %m1 = and <8 x i16> %a1, %m2 = and <8 x i16> %a2, diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -260,8 +260,7 @@ ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX2-NEXT: vpsubd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll --- a/llvm/test/CodeGen/X86/combine-rotates.ll +++ b/llvm/test/CodeGen/X86/combine-rotates.ll @@ -150,8 +150,7 @@ ; AVX2-LABEL: combine_vec_rot_select_zero: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm3 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3 ; AVX2-NEXT: vpsllvd %xmm3, %xmm0, %xmm4 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm3, %xmm5, %xmm3 @@ -207,8 +206,7 @@ ; ; AVX2-LABEL: rotate_demanded_bits: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [30,30,30,30] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 @@ -258,8 +256,7 @@ ; ; AVX2-LABEL: rotate_demanded_bits_2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [23,23,23,23] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, 
%xmm3, %xmm1 @@ -310,8 +307,7 @@ ; AVX2-LABEL: rotate_demanded_bits_3: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 @@ -361,8 +357,7 @@ ; ; AVX2-LABEL: rotl_binop_shuffle: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -80,8 +80,7 @@ ; ; AVX2-LABEL: combine_vec_sdiv_by_minsigned: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -149,17 +148,27 @@ ; ; AVX1-LABEL: combine_vec_sdiv_dupe: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; AVX1-NEXT: retq ; -; AVX2ORLATER-LABEL: combine_vec_sdiv_dupe: -; AVX2ORLATER: # %bb.0: -; AVX2ORLATER-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; AVX2ORLATER-NEXT: retq +; AVX2-LABEL: combine_vec_sdiv_dupe: +; AVX2: # %bb.0: +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_vec_sdiv_dupe: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: combine_vec_sdiv_dupe: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; AVX512BW-NEXT: retq ; ; XOP-LABEL: combine_vec_sdiv_dupe: ; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; XOP-NEXT: retq %1 = sdiv <4 x i32> %x, %x ret <4 x i32> %1 @@ -182,8 +191,7 @@ ; AVX2-LABEL: combine_vec_sdiv_by_pos0: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: combine_vec_sdiv_by_pos0: @@ -419,7 +427,7 @@ ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744073709551360,18446744073709551360] ; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -435,7 +443,7 @@ ; AVX512F-NEXT: vpmovsxbd %xmm1, %zmm1 ; AVX512F-NEXT: vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = 
[18446744073709551360,18446744073709551360] ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -672,8 +680,7 @@ ; AVX1-NEXT: vpsraw $1, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4,5,6],xmm3[7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: # ymm2 = mem[0,1,0,1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -724,9 +731,7 @@ ; XOP-NEXT: vpaddw %xmm3, %xmm0, %xmm3 ; XOP-NEXT: vpshaw %xmm2, %xmm3, %xmm2 ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] -; XOP-NEXT: # ymm2 = mem[0,1,0,1] -; XOP-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 +; XOP-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1, %ymm0 ; XOP-NEXT: retq %1 = sdiv <16 x i16> %x, ret <16 x i16> %1 @@ -904,8 +909,7 @@ ; AVX1-NEXT: vpsraw $1, %xmm5, %xmm5 ; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2],xmm6[3,4,5,6],xmm5[7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: # ymm5 = mem[0,1,0,1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] ; AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-NEXT: vorps %ymm0, %ymm2, %ymm0 @@ -931,12 +935,10 @@ ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v32i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsraw $15, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,2,16,8,32,64,2,0,4,2,16,8,32,64,2] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-NEXT: vpmulhuw %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,16384,32768,4096,8192,2048,1024,32768,0,16384,32768,4096,8192,2048,1024,32768] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-NEXT: vpmulhw %ymm4, %ymm2, %ymm5 ; AVX2-NEXT: vpsraw $1, %ymm2, %ymm2 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4,5,6],ymm2[7],ymm5[8,9],ymm2[10],ymm5[11,12,13,14],ymm2[15] @@ -1001,8 +1003,7 @@ ; XOP-NEXT: vpaddw %xmm5, %xmm0, %xmm5 ; XOP-NEXT: vpshaw %xmm3, %xmm5, %xmm5 ; XOP-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2 -; XOP-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] -; XOP-NEXT: # ymm5 = mem[0,1,0,1] +; XOP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] ; XOP-NEXT: vpcmov %ymm5, %ymm0, %ymm2, %ymm0 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm2 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm6 @@ -1463,12 +1464,10 @@ ; AVX2-LABEL: combine_vec_sdiv_by_pow2b_v16i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,30,29,28,0,30,29,28] -; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-NEXT: vpsrlvd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: 
vbroadcasti128 {{.*#+}} ymm4 = [0,2,3,4,0,2,3,4] -; AVX2-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-NEXT: vpsravd %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2 diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll --- a/llvm/test/CodeGen/X86/combine-shl.ll +++ b/llvm/test/CodeGen/X86/combine-shl.ll @@ -507,8 +507,7 @@ ; AVX-LABEL: combine_vec_shl_gt_lshr0: ; AVX: # %bb.0: ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -543,8 +542,7 @@ ; AVX-LABEL: combine_vec_shl_le_lshr0: ; AVX: # %bb.0: ; AVX-NEXT: vpsrld $2, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073741816,1073741816,1073741816,1073741816] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -589,8 +587,7 @@ ; ; AVX-LABEL: combine_vec_shl_ashr0: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [4294967264,4294967264,4294967264,4294967264] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = ashr <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -623,8 +620,7 @@ ; AVX-LABEL: combine_vec_shl_add0: ; AVX: # %bb.0: ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = add <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -670,8 +666,7 @@ ; AVX-LABEL: combine_vec_shl_or0: ; AVX: # %bb.0: ; AVX-NEXT: vpslld $2, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = or <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -726,8 +721,7 @@ ; ; AVX-LABEL: combine_vec_shl_mul0: ; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = mul <4 x i32> %x, %2 = shl <4 x i32> %1, @@ -781,8 +775,7 @@ ; AVX-LABEL: combine_vec_add_shl_nonsplat: ; AVX: # %bb.0: ; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %a0, %2 = add <4 x i32> %1, @@ -817,8 +810,7 @@ ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] ; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %a0, %2 = shl <4 x i32> %1, @@ -851,8 +843,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,0] ; AVX-NEXT: vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpor 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %a0, %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/combine-smax.ll b/llvm/test/CodeGen/X86/combine-smax.ll --- a/llvm/test/CodeGen/X86/combine-smax.ll +++ b/llvm/test/CodeGen/X86/combine-smax.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE42 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512 define <16 x i8> @test_v16i8_nosignbit(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: test_v16i8_nosignbit: @@ -32,13 +32,29 @@ ; SSE42-NEXT: pmaxsb %xmm2, %xmm0 ; SSE42-NEXT: retq ; -; AVX-LABEL: test_v16i8_nosignbit: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v16i8_nosignbit: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v16i8_nosignbit: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_v16i8_nosignbit: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %1 = and <16 x i8> %a, %2 = and <16 x i8> %b, %3 = icmp sgt <16 x i8> %1, %2 diff --git a/llvm/test/CodeGen/X86/combine-smin.ll b/llvm/test/CodeGen/X86/combine-smin.ll --- a/llvm/test/CodeGen/X86/combine-smin.ll +++ b/llvm/test/CodeGen/X86/combine-smin.ll @@ -31,14 +31,6 @@ ; SSE42-NEXT: pand %xmm1, %xmm2 ; SSE42-NEXT: pminsb %xmm2, %xmm0 ; SSE42-NEXT: retq -; -; AVX-LABEL: 
test_v16i8_nosignbit: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq %1 = and <16 x i8> %a, %2 = and <16 x i8> %b, %3 = icmp slt <16 x i8> %1, %2 diff --git a/llvm/test/CodeGen/X86/combine-srem.ll b/llvm/test/CodeGen/X86/combine-srem.ll --- a/llvm/test/CodeGen/X86/combine-srem.ll +++ b/llvm/test/CodeGen/X86/combine-srem.ll @@ -77,24 +77,14 @@ ; SSE-NEXT: paddd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_vec_srem_by_minsigned: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_vec_srem_by_minsigned: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_vec_srem_by_minsigned: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = srem <4 x i32> %x, ret <4 x i32> %1 } @@ -154,16 +144,10 @@ ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_vec_srem_by_pos0: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_vec_srem_by_pos0: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_vec_srem_by_pos0: +; AVX: # %bb.0: +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %1 = and <4 x i32> %x, %2 = srem <4 x i32> %1, ret <4 x i32> %2 @@ -196,24 +180,14 @@ ; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_vec_srem_by_pow2a: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $30, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_vec_srem_by_pow2a: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpsrld $30, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967292,4294967292,4294967292,4294967292] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_vec_srem_by_pow2a: +; AVX: # %bb.0: +; AVX-NEXT: vpsrad $31, %xmm0, %xmm1 +; AVX-NEXT: vpsrld $30, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %1 = srem <4 x i32> %x, ret <4 x i32> %1 } diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -77,8 +77,7 @@ ; ; AVX-LABEL: 
combine_vec_lshr_known_zero1: ; AVX: # %bb.0: -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, @@ -280,8 +279,7 @@ ; ; AVX-LABEL: combine_vec_lshr_shl_mask0: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1073741823,1073741823,1073741823,1073741823] -; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> %x, %2 = lshr <4 x i32> %1, @@ -330,8 +328,7 @@ ; AVX-LABEL: combine_vec_lshr_lzcnt_bit0: ; AVX: # %bb.0: ; AVX-NEXT: vpsrld $4, %xmm0, %xmm0 -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = and <4 x i32> %x, %2 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %1, i1 0) diff --git a/llvm/test/CodeGen/X86/combine-sub-usat.ll b/llvm/test/CodeGen/X86/combine-sub-usat.ll --- a/llvm/test/CodeGen/X86/combine-sub-usat.ll +++ b/llvm/test/CodeGen/X86/combine-sub-usat.ll @@ -260,8 +260,7 @@ ; ; AVX2-LABEL: combine_trunc_v8i32_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -135,19 +135,14 @@ ; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,1,1,1] ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_vec_udiv_dupe: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_vec_udiv_dupe: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] -; AVX2-NEXT: retq +; AVX-LABEL: combine_vec_udiv_dupe: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] +; AVX-NEXT: retq ; ; XOP-LABEL: combine_vec_udiv_dupe: ; XOP: # %bb.0: -; XOP-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; XOP-NEXT: retq %1 = udiv <4 x i32> %x, %x ret <4 x i32> %1 @@ -357,8 +352,7 @@ ; ; AVX2-LABEL: combine_vec_udiv_by_shl_pow2a: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,2,2,2] -; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-urem.ll b/llvm/test/CodeGen/X86/combine-urem.ll --- a/llvm/test/CodeGen/X86/combine-urem.ll +++ b/llvm/test/CodeGen/X86/combine-urem.ll @@ -96,16 +96,10 @@ ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_vec_urem_by_minsigned: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_vec_urem_by_minsigned: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: 
combine_vec_urem_by_minsigned: +; AVX: # %bb.0: +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %1 = urem <4 x i32> %x, ret <4 x i32> %1 } @@ -165,16 +159,10 @@ ; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: combine_vec_urem_by_pow2a: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_vec_urem_by_pow2a: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_vec_urem_by_pow2a: +; AVX: # %bb.0: +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %1 = urem <4 x i32> %x, ret <4 x i32> %1 } diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll --- a/llvm/test/CodeGen/X86/concat-cast.ll +++ b/llvm/test/CodeGen/X86/concat-cast.ll @@ -61,13 +61,10 @@ ; AVX2-LABEL: uitofp_v4i32_v4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] ; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX2-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -137,8 +134,7 @@ ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm1 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vsubps %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 @@ -303,8 +299,7 @@ ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm1 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vsubpd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 ; AVX2-NEXT: vandpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vorpd %xmm0, %xmm1, %xmm0 @@ -373,7 +368,7 @@ ; AVX2-LABEL: mismatch_tofp_v4i32_v4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4841369599423283200,4841369599423283200] ; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vsubpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll 
+++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -604,7 +604,7 @@ ; ; CHECK-FAST-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_3_binary: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vbroadcastss {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4] +; CHECK-FAST-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45] ; CHECK-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; CHECK-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll b/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll --- a/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll +++ b/llvm/test/CodeGen/X86/exedepsfix-broadcast.ll @@ -7,8 +7,7 @@ define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2) { ; CHECK-LABEL: ExeDepsFix_broadcastss: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; CHECK-NEXT: vandps %xmm2, %xmm0, %xmm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: vmaxps %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq %bitcast = bitcast <4 x float> %arg to <4 x i32> @@ -22,8 +21,7 @@ define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg2) { ; CHECK-LABEL: ExeDepsFix_broadcastss256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: vmaxps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %bitcast = bitcast <8 x float> %arg to <8 x i32> @@ -88,8 +86,7 @@ define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %arg2) { ; CHECK-LABEL: ExeDepsFix_broadcastsd256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-NEXT: vandpd %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: vmaxpd %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq %bitcast = bitcast <4 x double> %arg to <4 x i64> diff --git a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll @@ -171,8 +171,7 @@ ; ; AVX2-LABEL: vp_fabs_v4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, (%rdi) ; AVX2-NEXT: retq ; @@ -220,8 +219,7 @@ ; ; AVX2-LABEL: vp_fneg_v4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; AVX2-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, (%rdi) ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll --- a/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-int-intrinsics.ll @@ -146,8 +146,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] -; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; 
AVX2-NEXT: vextractps $1, %xmm1, %ecx ; AVX2-NEXT: vpextrd $1, %xmm0, %eax ; AVX2-NEXT: cltd @@ -289,8 +288,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] -; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vextractps $1, %xmm1, %ecx ; AVX2-NEXT: vpextrd $1, %xmm0, %eax ; AVX2-NEXT: xorl %edx, %edx @@ -432,8 +430,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] -; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vextractps $1, %xmm1, %ecx ; AVX2-NEXT: vpextrd $1, %xmm0, %eax ; AVX2-NEXT: cltd @@ -575,8 +572,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3] ; AVX2-NEXT: vpmaxud %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] -; AVX2-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vblendvps %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vextractps $1, %xmm1, %ecx ; AVX2-NEXT: vpextrd $1, %xmm0, %eax ; AVX2-NEXT: xorl %edx, %edx diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -5,15 +5,13 @@ define float @fneg_v4f32(<4 x float> %x) nounwind { ; X64-LABEL: fneg_v4f32: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X64-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fneg_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X86-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -26,9 +24,7 @@ define double @fneg_v4f64(<4 x double> %x) nounwind { ; X64-LABEL: fneg_v4f64: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] -; X64-NEXT: # xmm1 = mem[0,0] -; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X64-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; @@ -38,9 +34,7 @@ ; X86-NEXT: movl %esp, %ebp ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp -; X86-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0] -; X86-NEXT: # xmm1 = mem[0,0] -; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0 +; X86-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovlps %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -525,15 +519,13 @@ define float @fabs_v4f32(<4 x float> %x) nounwind { ; X64-LABEL: fabs_v4f32: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; X64-NEXT: vandps %xmm1, %xmm0, %xmm0 +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: fabs_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN] -; X86-NEXT: vandps %xmm1, %xmm0, %xmm0 +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -951,20 
+943,16 @@ define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind { ; X64-LABEL: copysign_v4f32: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; X64-NEXT: vandps %xmm2, %xmm0, %xmm0 +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vorps %xmm1, %xmm0, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: copysign_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] -; X86-NEXT: vandps %xmm2, %xmm0, %xmm0 +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vorps %xmm1, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -1227,8 +1215,7 @@ define float @round_v4f32(<4 x float> %x) nounwind { ; X64-LABEL: round_v4f32: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X64-NEXT: vandps %xmm1, %xmm0, %xmm1 +; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] ; X64-NEXT: vorps %xmm2, %xmm1, %xmm1 ; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -1238,8 +1225,7 @@ ; X86-LABEL: round_v4f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] -; X86-NEXT: vandps %xmm1, %xmm0, %xmm1 +; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 ; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] ; X86-NEXT: vorps %xmm2, %xmm1, %xmm1 ; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/extractelement-from-arg.ll b/llvm/test/CodeGen/X86/extractelement-from-arg.ll --- a/llvm/test/CodeGen/X86/extractelement-from-arg.ll +++ b/llvm/test/CodeGen/X86/extractelement-from-arg.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 define void @test(ptr %R, <4 x float> %X) nounwind { diff --git a/llvm/test/CodeGen/X86/extractelement-legalization-cycle.ll b/llvm/test/CodeGen/X86/extractelement-legalization-cycle.ll --- a/llvm/test/CodeGen/X86/extractelement-legalization-cycle.ll +++ b/llvm/test/CodeGen/X86/extractelement-legalization-cycle.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s ; When the extractelement is converted to a load the store can be re-used. 
@@ -9,8 +10,14 @@ define float @foo(ptr %i, ptr %v) { ; CHECK-LABEL: foo: -; CHECK: movaps %xmm0, -[[OFFSET:[0-9]+]](%rsp) -; CHECK: movss -[[OFFSET]](%rsp,{{.*}}), %xmm0 {{.*}} +; CHECK: # %bb.0: +; CHECK-NEXT: movaps (%rsi), %xmm0 +; CHECK-NEXT: mulps %xmm0, %xmm0 +; CHECK-NEXT: movaps %xmm0, (%rsi) +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: andl $3, %eax +; CHECK-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: retq %1 = load <4 x float>, ptr %v, align 16 %mul = fmul <4 x float> %1, %1 diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll --- a/llvm/test/CodeGen/X86/extractelement-load.ll +++ b/llvm/test/CodeGen/X86/extractelement-load.ll @@ -160,22 +160,13 @@ ; X64-SSSE3-NEXT: orps %xmm2, %xmm0 ; X64-SSSE3-NEXT: retq ; -; X64-AVX1-LABEL: t6: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX1-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1 -; X64-AVX1-NEXT: vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: t6: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: t6: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1 +; X64-AVX-NEXT: vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %vecload = load <8 x float>, ptr %a0, align 32 %vecext = extractelement <8 x float> %vecload, i32 1 %cmp = fcmp oeq float %vecext, 0.000000e+00 @@ -259,22 +250,13 @@ ; X64-SSSE3-NEXT: orps %xmm2, %xmm0 ; X64-SSSE3-NEXT: retq ; -; X64-AVX1-LABEL: PR43971_1: -; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX1-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1 -; X64-AVX1-NEXT: vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: PR43971_1: -; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-AVX2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: PR43971_1: +; X64-AVX: # %bb.0: # %entry +; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-AVX-NEXT: vcmpeqss %xmm1, %xmm0, %xmm1 +; X64-AVX-NEXT: vblendvps %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq entry: %0 = load <8 x float>, ptr %a0, align 32 %vecext = extractelement <8 x float> %0, i32 1 @@ -469,61 +451,33 @@ ; X64-SSSE3-NEXT: popq %rbp ; X64-SSSE3-NEXT: retq ; -; X64-AVX1-LABEL: main: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: pushq %rbp -; X64-AVX1-NEXT: movq %rsp, %rbp -; X64-AVX1-NEXT: andq $-32, %rsp -; X64-AVX1-NEXT: subq $64, %rsp -; X64-AVX1-NEXT: movq n1@GOTPCREL(%rip), %rax -; X64-AVX1-NEXT: vmovaps (%rax), %ymm0 -; X64-AVX1-NEXT: movl zero+4(%rip), %ecx -; X64-AVX1-NEXT: movl zero+8(%rip), 
%eax -; X64-AVX1-NEXT: vmovaps %ymm0, zero(%rip) -; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] -; X64-AVX1-NEXT: vmovaps %ymm0, (%rsp) -; X64-AVX1-NEXT: vmovaps (%rsp), %ymm0 -; X64-AVX1-NEXT: vextractps $2, %xmm0, %esi -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %esi -; X64-AVX1-NEXT: movl %eax, %esi -; X64-AVX1-NEXT: vextractps $1, %xmm0, %edi -; X64-AVX1-NEXT: movl %ecx, %eax -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %edi -; X64-AVX1-NEXT: addl %esi, %eax -; X64-AVX1-NEXT: movq %rbp, %rsp -; X64-AVX1-NEXT: popq %rbp -; X64-AVX1-NEXT: vzeroupper -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: main: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: pushq %rbp -; X64-AVX2-NEXT: movq %rsp, %rbp -; X64-AVX2-NEXT: andq $-32, %rsp -; X64-AVX2-NEXT: subq $64, %rsp -; X64-AVX2-NEXT: movq n1@GOTPCREL(%rip), %rax -; X64-AVX2-NEXT: vmovaps (%rax), %ymm0 -; X64-AVX2-NEXT: movl zero+4(%rip), %ecx -; X64-AVX2-NEXT: movl zero+8(%rip), %eax -; X64-AVX2-NEXT: vmovaps %ymm0, zero(%rip) -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [2,2,2,2,2,2,2,2] -; X64-AVX2-NEXT: vmovaps %ymm0, (%rsp) -; X64-AVX2-NEXT: vmovaps (%rsp), %ymm0 -; X64-AVX2-NEXT: vextractps $2, %xmm0, %esi -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %esi -; X64-AVX2-NEXT: movl %eax, %esi -; X64-AVX2-NEXT: vextractps $1, %xmm0, %edi -; X64-AVX2-NEXT: movl %ecx, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %edi -; X64-AVX2-NEXT: addl %esi, %eax -; X64-AVX2-NEXT: movq %rbp, %rsp -; X64-AVX2-NEXT: popq %rbp -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: main: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: pushq %rbp +; X64-AVX-NEXT: movq %rsp, %rbp +; X64-AVX-NEXT: andq $-32, %rsp +; X64-AVX-NEXT: subq $64, %rsp +; X64-AVX-NEXT: movq n1@GOTPCREL(%rip), %rax +; X64-AVX-NEXT: vmovaps (%rax), %ymm0 +; X64-AVX-NEXT: movl zero+4(%rip), %ecx +; X64-AVX-NEXT: movl zero+8(%rip), %eax +; X64-AVX-NEXT: vmovaps %ymm0, zero(%rip) +; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.80259693E-45,2.80259693E-45,2.80259693E-45,2.80259693E-45,2.80259693E-45,2.80259693E-45,2.80259693E-45,2.80259693E-45] +; X64-AVX-NEXT: vmovaps %ymm0, (%rsp) +; X64-AVX-NEXT: vmovaps (%rsp), %ymm0 +; X64-AVX-NEXT: vextractps $2, %xmm0, %esi +; X64-AVX-NEXT: xorl %edx, %edx +; X64-AVX-NEXT: divl %esi +; X64-AVX-NEXT: movl %eax, %esi +; X64-AVX-NEXT: vextractps $1, %xmm0, %edi +; X64-AVX-NEXT: movl %ecx, %eax +; X64-AVX-NEXT: xorl %edx, %edx +; X64-AVX-NEXT: divl %edi +; X64-AVX-NEXT: addl %esi, %eax +; X64-AVX-NEXT: movq %rbp, %rsp +; X64-AVX-NEXT: popq %rbp +; X64-AVX-NEXT: vzeroupper +; X64-AVX-NEXT: retq %stackptr = alloca <8 x i32>, align 32 %z = load <8 x i32>, ptr @zero, align 32 %t1 = load <8 x i32>, ptr @n1, align 32 @@ -536,3 +490,6 @@ %r = add i32 %e1, %e2 ret i32 %r } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; X64-AVX1: {{.*}} +; X64-AVX2: {{.*}} diff --git a/llvm/test/CodeGen/X86/extractelement-shuffle.ll b/llvm/test/CodeGen/X86/extractelement-shuffle.ll --- a/llvm/test/CodeGen/X86/extractelement-shuffle.ll +++ b/llvm/test/CodeGen/X86/extractelement-shuffle.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s ; REQUIRES: default_triple diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/fma-intrinsics-fast-isel.ll @@ -160,7 +160,7 @@ define <4 x float> @test_mm_fnmsub_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c) { ; CHECK-LABEL: test_mm_fnmsub_ps: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %xmm3, %xmm0, %xmm4 ; CHECK-NEXT: vxorps %xmm3, %xmm2, %xmm0 ; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0 @@ -175,7 +175,8 @@ define <2 x double> @test_mm_fnmsub_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: test_mm_fnmsub_pd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovapd {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; CHECK-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; CHECK-NEXT: # xmm3 = mem[0,0] ; CHECK-NEXT: vxorpd %xmm3, %xmm0, %xmm4 ; CHECK-NEXT: vxorpd %xmm3, %xmm2, %xmm0 ; CHECK-NEXT: vfmadd231pd {{.*#+}} xmm0 = (xmm1 * xmm4) + xmm0 @@ -342,7 +343,7 @@ define <8 x float> @test_mm256_fnmsub_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c) { ; CHECK-LABEL: test_mm256_fnmsub_ps: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorps %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vxorps %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0 @@ -357,7 +358,7 @@ define <4 x double> @test_mm256_fnmsub_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c) { ; CHECK-LABEL: test_mm256_fnmsub_pd: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; CHECK-NEXT: vxorpd %ymm3, %ymm0, %ymm4 ; CHECK-NEXT: vxorpd %ymm3, %ymm2, %ymm0 ; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm1 * ymm4) + ymm0 diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -791,14 +791,14 @@ define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; 
FMA4-INFS-NEXT: retq @@ -832,14 +832,14 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -873,14 +873,14 @@ define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0> +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <1.0E+0,u,1.0E+0,1.0E+0> +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -914,14 +914,14 @@ define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; FMA4-INFS-NEXT: retq @@ -955,14 +955,14 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -996,14 +996,14 @@ define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) { ; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0> +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA-INFS-NEXT: vmulps %xmm0, 
%xmm1, %xmm0 ; FMA-INFS-NEXT: retq ; ; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm2 = <-1.0E+0,-1.0E+0,u,-1.0E+0> +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; FMA4-INFS-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; FMA4-INFS-NEXT: retq @@ -1318,7 +1318,7 @@ define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) { ; FMA-INFS-LABEL: test_v4f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3 ; FMA-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1 ; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 @@ -1326,7 +1326,7 @@ ; ; FMA4-INFS-LABEL: test_v4f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %xmm2, %xmm3, %xmm3 ; FMA4-INFS-NEXT: vmulps %xmm3, %xmm1, %xmm1 ; FMA4-INFS-NEXT: vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 @@ -1367,7 +1367,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) { ; FMA-INFS-LABEL: test_v8f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 ; FMA-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; FMA-INFS-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1 @@ -1375,7 +1375,7 @@ ; ; FMA4-INFS-LABEL: test_v8f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm2, %ymm3, %ymm3 ; FMA4-INFS-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 @@ -1465,7 +1465,8 @@ define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) { ; FMA-INFS-LABEL: test_v2f64_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA-INFS-NEXT: # xmm3 = mem[0,0] ; FMA-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3 ; FMA-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1 ; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 @@ -1473,7 +1474,8 @@ ; ; FMA4-INFS-LABEL: test_v2f64_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: # xmm3 = mem[0,0] ; FMA4-INFS-NEXT: vsubpd %xmm2, %xmm3, %xmm3 ; FMA4-INFS-NEXT: vmulpd %xmm3, %xmm1, %xmm1 ; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 @@ -1515,7 +1517,7 @@ define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) { ; FMA-INFS-LABEL: test_v4f64_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3 ; FMA-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA-INFS-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) 
+ ymm1 @@ -1523,7 +1525,7 @@ ; ; FMA4-INFS-LABEL: test_v4f64_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm2, %ymm3, %ymm3 ; FMA4-INFS-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1 diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -259,7 +259,7 @@ define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -268,7 +268,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -305,7 +305,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -314,7 +314,7 @@ ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -351,7 +351,7 @@ define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -360,7 +360,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -397,7 +397,7 @@ define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: 
vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -406,7 +406,7 @@ ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -443,7 +443,7 @@ define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -452,7 +452,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -490,7 +490,7 @@ define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -499,7 +499,7 @@ ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -537,7 +537,7 @@ define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -546,7 +546,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubps %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -584,7 +584,7 @@ define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: 
test_v8f64_mul_y_sub_negone_x: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -593,7 +593,7 @@ ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm1, %ymm4, %ymm1 ; FMA4-INFS-NEXT: vsubpd %ymm0, %ymm4, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -631,7 +631,7 @@ define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -640,7 +640,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -677,7 +677,7 @@ define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -686,7 +686,7 @@ ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -723,7 +723,7 @@ define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) { ; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -732,7 +732,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddps %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -769,7 +769,7 @@ define <8 x double> 
@test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) { ; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -778,7 +778,7 @@ ; ; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm1, %ymm1 ; FMA4-INFS-NEXT: vaddpd %ymm4, %ymm0, %ymm0 ; FMA4-INFS-NEXT: vmulpd %ymm0, %ymm2, %ymm0 @@ -819,7 +819,7 @@ define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) { ; FMA-INFS-LABEL: test_v16f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 ; FMA-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 ; FMA-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3 @@ -830,7 +830,7 @@ ; ; FMA4-INFS-LABEL: test_v16f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovaps {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubps %ymm4, %ymm6, %ymm7 ; FMA4-INFS-NEXT: vsubps %ymm5, %ymm6, %ymm6 ; FMA4-INFS-NEXT: vmulps %ymm6, %ymm3, %ymm3 @@ -878,7 +878,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) { ; FMA-INFS-LABEL: test_v8f64_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7 ; FMA-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6 ; FMA-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3 @@ -889,7 +889,7 @@ ; ; FMA4-INFS-LABEL: test_v8f64_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovapd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA4-INFS-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA4-INFS-NEXT: vsubpd %ymm4, %ymm6, %ymm7 ; FMA4-INFS-NEXT: vsubpd %ymm5, %ymm6, %ymm6 ; FMA4-INFS-NEXT: vmulpd %ymm6, %ymm3, %ymm3 @@ -1143,7 +1143,7 @@ ; FMA: # %bb.0: ; FMA-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; FMA-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; FMA-NEXT: vxorpd %ymm2, %ymm0, %ymm0 ; FMA-NEXT: vxorpd %ymm2, %ymm1, %ymm1 ; FMA-NEXT: retq @@ -1152,7 +1152,7 @@ ; FMA4: # %bb.0: ; FMA4-NEXT: vmulpd %ymm3, %ymm1, %ymm1 ; FMA4-NEXT: vmulpd %ymm2, %ymm0, %ymm0 -; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; FMA4-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; FMA4-NEXT: vxorpd %ymm2, %ymm0, %ymm0 ; FMA4-NEXT: vxorpd %ymm2, %ymm1, %ymm1 ; FMA4-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll b/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll --- a/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll +++ b/llvm/test/CodeGen/X86/fold-vector-trunc-sitofp.ll @@ -7,7 +7,8 @@ define <4 x float> @test1() { ; 
CHECK-LABEL: test1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,-1.0E+0,0.0E+0] +; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [1.5873523201947252E-314,1.5873523201947252E-314] +; CHECK-NEXT: # xmm0 = mem[0,0] ; CHECK-NEXT: ret{{[l|q]}} %1 = trunc <4 x i3> to <4 x i1> %2 = sitofp <4 x i1> %1 to <4 x float> diff --git a/llvm/test/CodeGen/X86/fp-round.ll b/llvm/test/CodeGen/X86/fp-round.ll --- a/llvm/test/CodeGen/X86/fp-round.ll +++ b/llvm/test/CodeGen/X86/fp-round.ll @@ -572,9 +572,9 @@ ; ; AVX1-LABEL: round_v16f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] ; AVX1-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vaddps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vroundps $11, %ymm0, %ymm0 @@ -680,9 +680,9 @@ ; ; AVX1-LABEL: round_v8f64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm3 -; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1,4.9999999999999994E-1] ; AVX1-NEXT: vorpd %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vaddpd %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vroundpd $11, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/freeze-binary.ll b/llvm/test/CodeGen/X86/freeze-binary.ll --- a/llvm/test/CodeGen/X86/freeze-binary.ll +++ b/llvm/test/CodeGen/X86/freeze-binary.ll @@ -190,8 +190,7 @@ ; ; X64-LABEL: freeze_add_vec: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] -; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq %x = add <4 x i32> %a0, %y = freeze <4 x i32> %x @@ -261,8 +260,7 @@ ; ; X64-LABEL: freeze_sub_vec: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] -; X64-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq %x = sub <4 x i32> %a0, %y = freeze <4 x i32> %x diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -288,10 +288,9 @@ ; ; X64-LABEL: freeze_buildvector_single_maybe_poison_operand: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} xmm0 = +; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42] ; X64-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%rsi) ; X64-NEXT: retq %i0.src = load i32, ptr %origin @@ -322,11 +321,10 @@ ; ; X64-LABEL: freeze_buildvector_single_repeated_maybe_poison_operand: ; X64: # %bb.0: -; X64-NEXT: vmovdqa {{.*#+}} xmm0 = +; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [42,42,42,42] ; X64-NEXT: vpinsrd $0, (%rdi), %xmm0, %xmm0 ; X64-NEXT: vpbroadcastq 
%xmm0, %xmm0 -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%rsi) ; X64-NEXT: retq %i0.src = load i32, ptr %origin @@ -447,8 +445,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl (%edx), %edx ; X86-NEXT: andl $15, %edx -; X86-NEXT: vmovddup {{.*#+}} xmm0 = [7,7] -; X86-NEXT: # xmm0 = mem[0,0] +; X86-NEXT: vmovdqa {{.*#+}} xmm0 = [7,0,7,0] ; X86-NEXT: vmovd %edx, %xmm1 ; X86-NEXT: vpand %xmm0, %xmm1, %xmm2 ; X86-NEXT: vmovdqa %xmm2, (%ecx) @@ -506,8 +503,7 @@ ; X64-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 ; X64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0 ; X64-NEXT: vpinsrd $3, (%rcx), %xmm0, %xmm0 -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%r8) ; X64-NEXT: retq %i0.src = load i32, ptr %origin0 @@ -551,8 +547,7 @@ ; X64-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 ; X64-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 ; X64-NEXT: vpinsrd $3, (%rcx), %xmm0, %xmm0 -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%r8) ; X64-NEXT: retq %i0.src = load i32, ptr %origin0 @@ -601,11 +596,9 @@ ; X64-NEXT: vpinsrd $1, (%rsi), %xmm0, %xmm0 ; X64-NEXT: vpinsrd $2, (%rdx), %xmm0, %xmm0 ; X64-NEXT: vpinsrd $3, (%rcx), %xmm0, %xmm0 -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%r9) -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; X64-NEXT: vpand %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%r8) ; X64-NEXT: retq %i0.src = load i32, ptr %origin0 diff --git a/llvm/test/CodeGen/X86/funnel-shift-rot.ll b/llvm/test/CodeGen/X86/funnel-shift-rot.ll --- a/llvm/test/CodeGen/X86/funnel-shift-rot.ll +++ b/llvm/test/CodeGen/X86/funnel-shift-rot.ll @@ -149,8 +149,7 @@ ; ; X64-AVX2-LABEL: rotl_v4i32: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; X64-AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; X64-AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 @@ -328,8 +327,7 @@ ; ; X64-AVX2-LABEL: rotr_v4i32: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; X64-AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; X64-AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll --- a/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-funnel-shifts.ll @@ -238,7 +238,7 @@ ; GFNIAVX2-LABEL: splatconstant_fshl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $7, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; 
GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -323,7 +323,7 @@ ; GFNIAVX2-LABEL: splatconstant_fshr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $2, %ymm2, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] ; GFNIAVX2-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -242,7 +242,7 @@ ; GFNIAVX2-LABEL: splatconstant_rotl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; GFNIAVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 @@ -328,7 +328,7 @@ ; GFNIAVX2-LABEL: splatconstant_rotr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192,192] ; GFNIAVX2-NEXT: vpandn %ymm2, %ymm3, %ymm2 ; GFNIAVX2-NEXT: vpsllw $6, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/gfni-shifts.ll b/llvm/test/CodeGen/X86/gfni-shifts.ll --- a/llvm/test/CodeGen/X86/gfni-shifts.ll +++ b/llvm/test/CodeGen/X86/gfni-shifts.ll @@ -50,14 +50,23 @@ ; GFNISSE-NEXT: psubb %xmm1, %xmm0 ; GFNISSE-NEXT: retq ; -; GFNIAVX1OR2-LABEL: splatconstant_ashr_v16i8: -; GFNIAVX1OR2: # %bb.0: -; GFNIAVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; GFNIAVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; GFNIAVX1OR2-NEXT: retq +; GFNIAVX1-LABEL: splatconstant_ashr_v16i8: +; GFNIAVX1: # %bb.0: +; GFNIAVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; GFNIAVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; GFNIAVX1-NEXT: retq +; +; GFNIAVX2-LABEL: splatconstant_ashr_v16i8: +; GFNIAVX2: # %bb.0: +; GFNIAVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; GFNIAVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: splatconstant_ashr_v16i8: ; GFNIAVX512: # %bb.0: @@ -181,7 +190,7 @@ ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; 
GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; GFNIAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq @@ -235,7 +244,7 @@ ; GFNIAVX2-LABEL: splatconstant_shl_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsllw $5, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224,224] ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsllw $5, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -284,7 +293,7 @@ ; GFNIAVX2-LABEL: splatconstant_lshr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsrlw $7, %ymm1, %ymm1 ; GFNIAVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 @@ -351,9 +360,9 @@ ; GFNIAVX2-LABEL: splatconstant_ashr_v64i8: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpsrlw $1, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; GFNIAVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; GFNIAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; GFNIAVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; GFNIAVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsubb %ymm3, %ymm0, %ymm0 ; GFNIAVX2-NEXT: vpsrlw $1, %ymm1, %ymm1 @@ -372,3 +381,5 @@ %shift = ashr <64 x i8> %a, ret <64 x i8> %shift } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFNIAVX1OR2: {{.*}} diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -512,15 +512,6 @@ ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl ; -; AVX2-LABEL: vec_4xi32_splat_eq: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: ret{{[l|q]}} -; ; X64-SSE2-LABEL: vec_4xi32_splat_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pxor %xmm2, %xmm2 @@ -596,15 +587,6 @@ ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm0 ; X86-SSE2-NEXT: retl ; -; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: ret{{[l|q]}} -; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pxor %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-shl-in-eqcmp-zero.ll @@ -468,15 +468,6 @@ ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; AVX2-LABEL: vec_4xi32_splat_eq: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: ret{{[l|q]}} -; ; X64-SSE2-LABEL: vec_4xi32_splat_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pslld $23, %xmm1 @@ -556,15 +547,6 @@ ; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; AVX2-LABEL: vec_4xi32_nonsplat_undef0_eq: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,1,1,1] -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: ret{{[l|q]}} -; ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef0_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movl $1, %eax diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -52,11 +52,10 @@ ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -122,7 +121,7 @@ ; X64-AVX2-LABEL: 
test_reduce_v2i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -525,10 +524,9 @@ ; X86-AVX1-LABEL: test_reduce_v4i64: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -633,7 +631,7 @@ ; X64-AVX2-LABEL: test_reduce_v4i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -1220,16 +1218,15 @@ ; ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm5 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm5 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 @@ -1250,10 +1247,12 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 -; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X86-AVX2-NEXT: ## xmm2 = mem[0,0] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -1394,10 +1393,12 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; X64-AVX2-NEXT: vpxor 
%ymm2, %ymm0, %ymm4 -; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: ## xmm2 = mem[0,0] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -53,11 +53,10 @@ ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 +; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -124,7 +123,7 @@ ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -468,11 +467,10 @@ ; ; X86-AVX1-LABEL: test_reduce_v4i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X86-AVX1-NEXT: ## xmm1 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] @@ -579,7 +577,7 @@ ; X64-AVX2-LABEL: test_reduce_v4i64: ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -1137,15 +1135,14 @@ ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, 
%xmm6 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 ; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 +; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm5 ; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 @@ -1166,10 +1163,12 @@ ; X86-AVX2: ## %bb.0: ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 -; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 -; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X86-AVX2-NEXT: ## xmm2 = mem[0,0] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -1312,10 +1311,12 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 -; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 -; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 +; X64-AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; X64-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; X64-AVX2-NEXT: ## xmm2 = mem[0,0] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -291,12 +291,10 @@ ; ; X86-AVX-LABEL: clamp_sitofp_2i64_2f64: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361] -; X86-AVX-NEXT: # xmm1 = mem[0,0] +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967041,4294967295,4294967041,4294967295] ; X86-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [255,255] -; X86-AVX-NEXT: # xmm1 = mem[0,0] +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,255,0] ; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -103,8 +103,7 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 
; AVX2-NEXT: vzeroupper @@ -164,8 +163,7 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: illegal_abs_to_eq_or_sext: @@ -224,8 +222,7 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -294,8 +291,7 @@ ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -358,8 +354,7 @@ ; AVX2-LABEL: legal_abs_eq_unchanged: ; AVX2: # %bb.0: ; AVX2-NEXT: vpabsd %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: legal_abs_eq_unchanged: @@ -392,8 +387,7 @@ ; AVX2-LABEL: legal_abs_eq_unchanged_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpabsd %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: legal_abs_eq_unchanged_sext: @@ -428,8 +422,7 @@ ; AVX2-LABEL: legal_abs_ne_unchangedd: ; AVX2: # %bb.0: ; AVX2-NEXT: vpabsd %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -469,8 +462,7 @@ ; AVX2-LABEL: legal_abs_ne_unchangedd_sext: ; AVX2: # %bb.0: ; AVX2-NEXT: vpabsd %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -511,10 +503,8 @@ ; ; AVX2-LABEL: eq_or_to_abs_vec4x64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] -; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 @@ -570,10 +560,8 @@ ; ; AVX2-LABEL: eq_or_to_abs_vec4x64_sext: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpbroadcastq 
{{.*#+}} ymm2 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] -; AVX2-NEXT: vpcmpeqq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -642,11 +630,9 @@ ; ; AVX2-LABEL: ne_and_to_abs_vec4x64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] -; AVX2-NEXT: vpcmpeqq %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -710,11 +696,9 @@ ; ; AVX2-LABEL: ne_and_to_abs_vec4x64_sext: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [129,129,129,129] -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551487,18446744073709551487,18446744073709551487,18446744073709551487] -; AVX2-NEXT: vpcmpeqq %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpandn %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -786,9 +770,8 @@ ; ; AVX2-LABEL: eq_or_to_abs_vec4x32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX2-NEXT: vpabsd %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_to_abs_vec4x32: @@ -821,9 +804,8 @@ ; ; AVX2-LABEL: eq_or_to_abs_vec4x32_sext: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX2-NEXT: vpabsd %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; SSE41-LABEL: eq_or_to_abs_vec4x32_sext: @@ -858,9 +840,8 @@ ; ; AVX2-LABEL: ne_and_to_abs_vec4x32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX2-NEXT: vpabsd %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -900,9 +881,8 @@ ; ; AVX2-LABEL: ne_and_to_abs_vec4x32_sext: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] ; AVX2-NEXT: vpabsd %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll --- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll @@ -46,8 +46,7 @@ ; ; AVX2-LABEL: andnot_eq_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967287,4294967287,4294967287,4294967287] -; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd 
%xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -226,7 +225,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -150,11 +150,23 @@ ; X86-AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] ; X86-AVX-NEXT: retl ; -; X64-AVX-LABEL: elt0_v2i64: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = -; X64-AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: elt0_v2i64: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = +; X64-AVX1-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: elt0_v2i64: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; X64-AVX2-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq +; +; X64-AVX512F-LABEL: elt0_v2i64: +; X64-AVX512F: # %bb.0: +; X64-AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,1] +; X64-AVX512F-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0 +; X64-AVX512F-NEXT: retq %ins = insertelement <2 x i64> , i64 %x, i32 0 ret <2 x i64> %ins } @@ -219,13 +231,15 @@ ; ; X86-AVX-LABEL: elt1_v2f64: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u> +; X86-AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4.2E+1,4.2E+1] +; X86-AVX-NEXT: # xmm0 = mem[0,0] ; X86-AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt1_v2f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u> +; X64-AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; X64-AVX-NEXT: # xmm1 = mem[0,0] ; X64-AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64-AVX-NEXT: retq %ins = insertelement <2 x double> , double %x, i32 1 diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -156,12 +156,12 @@ define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind { ; X86-LABEL: knownbits_mask_or_shuffle_uitofp: ; X86: # %bb.0: -; X86-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] +; X86-NEXT: vbroadcastss {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] ; X86-NEXT: retl ; ; X64-LABEL: knownbits_mask_or_shuffle_uitofp: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] +; X64-NEXT: vbroadcastss {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4] ; X64-NEXT: retq %1 = and <4 x i32> %a0, %2 = or <4 x i32> %1, @@ -385,7 +385,7 @@ ; X86-LABEL: knownbits_mask_concat_uitofp: ; X86: # %bb.0: ; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] -; X86-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071] +; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.83669591E-40,1.83669591E-40,1.83669591E-40,1.83669591E-40] ; X86-NEXT: vandps %xmm2, %xmm1, %xmm1 ; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; X86-NEXT: vandps %xmm2, %xmm0, %xmm0 @@ -396,7 +396,7 @@ ; X64-LABEL: knownbits_mask_concat_uitofp: ; X64: # %bb.0: ; 
X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] -; X64-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071] +; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.83669591E-40,1.83669591E-40,1.83669591E-40,1.83669591E-40] ; X64-NEXT: vandps %xmm2, %xmm1, %xmm1 ; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; X64-NEXT: vandps %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll --- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -425,7 +425,7 @@ ; AVX2-LABEL: reassociate_umax_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5 ; AVX2-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4 @@ -723,7 +723,7 @@ ; AVX2-LABEL: reassociate_umin_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm2, %xmm4 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm5 ; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 @@ -1076,15 +1076,15 @@ ; ; AVX2-LABEL: reassociate_umax_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm4 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vxorpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1489,15 +1489,15 @@ ; ; AVX2-LABEL: reassociate_umin_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm4 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm5 +; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vxorpd %ymm1, %ymm0, %ymm2 +; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2068,8 +2068,8 @@ ; AVX2-LABEL: reassociate_umax_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm2, 
%ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm2, %ymm5, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm8 ; AVX2-NEXT: vpcmpgtq %ymm8, %ymm3, %ymm3 @@ -2833,8 +2833,8 @@ ; AVX2-LABEL: reassociate_umin_v8i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm2, %ymm5, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm8 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm8, %ymm3 diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll --- a/llvm/test/CodeGen/X86/masked_load.ll +++ b/llvm/test/CodeGen/X86/masked_load.ll @@ -6856,8 +6856,7 @@ ; ; AVX1OR2-LABEL: mload_constmask_v16f64_allones_split: ; AVX1OR2: ## %bb.0: -; AVX1OR2-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0] -; AVX1OR2-NEXT: ## ymm0 = mem[0,1,0,1] +; AVX1OR2-NEXT: vmovapd {{.*#+}} ymm0 = [18446744073709551615,0,18446744073709551615,0] ; AVX1OR2-NEXT: vmaskmovpd 64(%rdi), %ymm0, %ymm1 ; AVX1OR2-NEXT: vblendpd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2],ymm2[3] ; AVX1OR2-NEXT: vmaskmovpd 96(%rdi), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -376,7 +376,7 @@ ; ; AVX1-LABEL: truncstore_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 @@ -764,7 +764,7 @@ ; ; AVX1-LABEL: truncstore_v8i64_v8i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1 @@ -2221,7 +2221,7 @@ ; ; AVX1-LABEL: truncstore_v16i32_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41] ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 @@ -2897,7 +2897,7 @@ ; ; AVX1-LABEL: truncstore_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 @@ -4879,7 +4879,7 
@@ ; ; AVX1-LABEL: truncstore_v32i16_v32i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38] ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpackuswb %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -2451,10 +2451,10 @@ ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2147483647,2147483647] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2588,34 +2588,63 @@ ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB7_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB7_3 -; AVX-NEXT: .LBB7_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB7_1: # %cond.store -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB7_4 -; AVX-NEXT: .LBB7_3: # %cond.store1 -; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB7_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB7_3 +; AVX1-NEXT: .LBB7_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB7_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB7_4 +; AVX1-NEXT: .LBB7_3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: retq +; +; 
AVX2-LABEL: truncstore_v2i64_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [32767,32767] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB7_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB7_3 +; AVX2-NEXT: .LBB7_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB7_1: # %cond.store +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB7_4 +; AVX2-NEXT: .LBB7_3: # %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: ; AVX512F: # %bb.0: @@ -2756,33 +2785,61 @@ ; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB8_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB8_3 -; AVX-NEXT: .LBB8_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB8_1: # %cond.store -; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB8_4 -; AVX-NEXT: .LBB8_3: # %cond.store1 -; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB8_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB8_3 +; AVX1-NEXT: .LBB8_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB8_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB8_4 +; AVX1-NEXT: .LBB8_3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488] +; 
AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB8_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB8_3 +; AVX2-NEXT: .LBB8_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB8_1: # %cond.store +; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB8_4 +; AVX2-NEXT: .LBB8_3: # %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -5231,85 +5288,44 @@ ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: retq ; -; AVX1-LABEL: truncstore_v4i32_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: jne .LBB14_1 -; AVX1-NEXT: # %bb.2: # %else -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: jne .LBB14_3 -; AVX1-NEXT: .LBB14_4: # %else2 -; AVX1-NEXT: testb $4, %al -; AVX1-NEXT: jne .LBB14_5 -; AVX1-NEXT: .LBB14_6: # %else4 -; AVX1-NEXT: testb $8, %al -; AVX1-NEXT: jne .LBB14_7 -; AVX1-NEXT: .LBB14_8: # %else6 -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB14_1: # %cond.store -; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: je .LBB14_4 -; AVX1-NEXT: .LBB14_3: # %cond.store1 -; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: testb $4, %al -; AVX1-NEXT: je .LBB14_6 -; AVX1-NEXT: .LBB14_5: # %cond.store3 -; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: testb $8, %al -; AVX1-NEXT: je .LBB14_8 -; AVX1-NEXT: .LBB14_7: # %cond.store5 -; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: truncstore_v4i32_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [127,127,127,127] -; AVX2-NEXT: vpminsd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4294967168,4294967168,4294967168,4294967168] -; AVX2-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: jne .LBB14_1 -; AVX2-NEXT: # %bb.2: # %else -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: jne .LBB14_3 -; AVX2-NEXT: .LBB14_4: # %else2 -; AVX2-NEXT: testb $4, %al -; AVX2-NEXT: jne .LBB14_5 -; AVX2-NEXT: .LBB14_6: # %else4 -; AVX2-NEXT: testb $8, %al -; AVX2-NEXT: jne .LBB14_7 -; AVX2-NEXT: .LBB14_8: # %else6 -; AVX2-NEXT: retq -; AVX2-NEXT: .LBB14_1: # %cond.store -; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: je .LBB14_4 -; AVX2-NEXT: .LBB14_3: # %cond.store1 -; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: testb $4, %al -; AVX2-NEXT: je .LBB14_6 -; AVX2-NEXT: .LBB14_5: # %cond.store3 -; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: testb $8, %al -; AVX2-NEXT: je .LBB14_8 -; AVX2-NEXT: .LBB14_7: # %cond.store5 -; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: 
truncstore_v4i32_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovmskps %xmm1, %eax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: jne .LBB14_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB14_3 +; AVX-NEXT: .LBB14_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB14_5 +; AVX-NEXT: .LBB14_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB14_7 +; AVX-NEXT: .LBB14_8: # %else6 +; AVX-NEXT: retq +; AVX-NEXT: .LBB14_1: # %cond.store +; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX-NEXT: testb $2, %al +; AVX-NEXT: je .LBB14_4 +; AVX-NEXT: .LBB14_3: # %cond.store1 +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX-NEXT: testb $4, %al +; AVX-NEXT: je .LBB14_6 +; AVX-NEXT: .LBB14_5: # %cond.store3 +; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX-NEXT: testb $8, %al +; AVX-NEXT: je .LBB14_8 +; AVX-NEXT: .LBB14_7: # %cond.store5 +; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i8: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -232,7 +232,8 @@ ; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm6, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm7 @@ -258,7 +259,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] @@ -545,7 +546,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -629,7 +631,7 @@ ; AVX2-LABEL: truncstore_v8i64_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [65535,65535,65535,65535] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = 
[9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] @@ -1018,7 +1020,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -1103,7 +1106,7 @@ ; AVX2-LABEL: truncstore_v8i64_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] @@ -1393,7 +1396,8 @@ ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 @@ -1409,9 +1413,8 @@ ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 @@ -1588,7 +1591,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 @@ -1632,9 +1636,8 @@ ; AVX2-LABEL: truncstore_v4i64_v4i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [65535,65535,65535,65535] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = 
[3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 @@ -1869,7 +1872,8 @@ ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm6, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm3 @@ -1916,9 +1920,8 @@ ; AVX2-LABEL: truncstore_v4i64_v4i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 @@ -2099,7 +2102,8 @@ ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -2115,9 +2119,10 @@ ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [4294967295,4294967295] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -2231,33 +2236,63 @@ ; SSE4-NEXT: pextrw $1, %xmm0, 2(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [65535,65535] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] -; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; 
AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB7_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB7_3 -; AVX-NEXT: .LBB7_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB7_1: # %cond.store -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB7_4 -; AVX-NEXT: .LBB7_3: # %cond.store1 -; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB7_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB7_3 +; AVX1-NEXT: .LBB7_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB7_1: # %cond.store +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB7_4 +; AVX1-NEXT: .LBB7_3: # %cond.store1 +; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX2-NEXT: # xmm3 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] +; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB7_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB7_3 +; AVX2-NEXT: .LBB7_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB7_1: # %cond.store +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB7_4 +; AVX2-NEXT: .LBB7_3: # %cond.store1 +; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i16: ; AVX512F: # %bb.0: @@ -2378,32 +2413,61 @@ ; SSE4-NEXT: pextrb $1, %xmm3, 1(%rdi) ; SSE4-NEXT: retq ; -; AVX-LABEL: truncstore_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vmovapd {{.*#+}} xmm3 = [255,255] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 -; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] -; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax -; AVX-NEXT: testb $1, %al -; AVX-NEXT: jne .LBB8_1 -; AVX-NEXT: # %bb.2: # %else -; AVX-NEXT: testb $2, %al -; AVX-NEXT: jne .LBB8_3 -; AVX-NEXT: .LBB8_4: # %else2 -; AVX-NEXT: retq -; AVX-NEXT: .LBB8_1: # %cond.store -; AVX-NEXT: vpextrb $0, %xmm0, 
(%rdi) -; AVX-NEXT: testb $2, %al -; AVX-NEXT: je .LBB8_4 -; AVX-NEXT: .LBB8_3: # %cond.store1 -; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: truncstore_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovmskpd %xmm1, %eax +; AVX1-NEXT: xorl $3, %eax +; AVX1-NEXT: testb $1, %al +; AVX1-NEXT: jne .LBB8_1 +; AVX1-NEXT: # %bb.2: # %else +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: jne .LBB8_3 +; AVX1-NEXT: .LBB8_4: # %else2 +; AVX1-NEXT: retq +; AVX1-NEXT: .LBB8_1: # %cond.store +; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX1-NEXT: testb $2, %al +; AVX1-NEXT: je .LBB8_4 +; AVX1-NEXT: .LBB8_3: # %cond.store1 +; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: truncstore_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX2-NEXT: # xmm3 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm4 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; AVX2-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vmovmskpd %xmm1, %eax +; AVX2-NEXT: xorl $3, %eax +; AVX2-NEXT: testb $1, %al +; AVX2-NEXT: jne .LBB8_1 +; AVX2-NEXT: # %bb.2: # %else +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: jne .LBB8_3 +; AVX2-NEXT: .LBB8_4: # %else2 +; AVX2-NEXT: retq +; AVX2-NEXT: .LBB8_1: # %cond.store +; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX2-NEXT: testb $2, %al +; AVX2-NEXT: je .LBB8_4 +; AVX2-NEXT: .LBB8_3: # %cond.store1 +; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: truncstore_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -4128,8 +4192,7 @@ ; AVX2-LABEL: truncstore_v8i32_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 @@ -4524,8 +4587,7 @@ ; AVX2-LABEL: truncstore_v8i32_v8i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminud %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 @@ -4771,80 +4833,42 @@ ; SSE4-NEXT: pextrw $3, %xmm0, 6(%rdi) ; SSE4-NEXT: retq ; -; AVX1-LABEL: truncstore_v4i32_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; 
AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: jne .LBB13_1 -; AVX1-NEXT: # %bb.2: # %else -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: jne .LBB13_3 -; AVX1-NEXT: .LBB13_4: # %else2 -; AVX1-NEXT: testb $4, %al -; AVX1-NEXT: jne .LBB13_5 -; AVX1-NEXT: .LBB13_6: # %else4 -; AVX1-NEXT: testb $8, %al -; AVX1-NEXT: jne .LBB13_7 -; AVX1-NEXT: .LBB13_8: # %else6 -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB13_1: # %cond.store -; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: je .LBB13_4 -; AVX1-NEXT: .LBB13_3: # %cond.store1 -; AVX1-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX1-NEXT: testb $4, %al -; AVX1-NEXT: je .LBB13_6 -; AVX1-NEXT: .LBB13_5: # %cond.store3 -; AVX1-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX1-NEXT: testb $8, %al -; AVX1-NEXT: je .LBB13_8 -; AVX1-NEXT: .LBB13_7: # %cond.store5 -; AVX1-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: truncstore_v4i32_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [65535,65535,65535,65535] -; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: jne .LBB13_1 -; AVX2-NEXT: # %bb.2: # %else -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: jne .LBB13_3 -; AVX2-NEXT: .LBB13_4: # %else2 -; AVX2-NEXT: testb $4, %al -; AVX2-NEXT: jne .LBB13_5 -; AVX2-NEXT: .LBB13_6: # %else4 -; AVX2-NEXT: testb $8, %al -; AVX2-NEXT: jne .LBB13_7 -; AVX2-NEXT: .LBB13_8: # %else6 -; AVX2-NEXT: retq -; AVX2-NEXT: .LBB13_1: # %cond.store -; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: je .LBB13_4 -; AVX2-NEXT: .LBB13_3: # %cond.store1 -; AVX2-NEXT: vpextrw $1, %xmm0, 2(%rdi) -; AVX2-NEXT: testb $4, %al -; AVX2-NEXT: je .LBB13_6 -; AVX2-NEXT: .LBB13_5: # %cond.store3 -; AVX2-NEXT: vpextrw $2, %xmm0, 4(%rdi) -; AVX2-NEXT: testb $8, %al -; AVX2-NEXT: je .LBB13_8 -; AVX2-NEXT: .LBB13_7: # %cond.store5 -; AVX2-NEXT: vpextrw $3, %xmm0, 6(%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: truncstore_v4i32_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovmskps %xmm1, %eax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: jne .LBB13_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB13_3 +; AVX-NEXT: .LBB13_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB13_5 +; AVX-NEXT: .LBB13_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB13_7 +; AVX-NEXT: .LBB13_8: # %else6 +; AVX-NEXT: retq +; AVX-NEXT: .LBB13_1: # %cond.store +; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX-NEXT: testb $2, %al +; AVX-NEXT: je .LBB13_4 +; AVX-NEXT: .LBB13_3: # %cond.store1 +; AVX-NEXT: vpextrw $1, %xmm0, 2(%rdi) +; AVX-NEXT: testb $4, %al +; AVX-NEXT: je .LBB13_6 +; AVX-NEXT: .LBB13_5: # %cond.store3 +; AVX-NEXT: vpextrw $2, %xmm0, 4(%rdi) +; AVX-NEXT: testb $8, %al +; AVX-NEXT: je .LBB13_8 +; AVX-NEXT: .LBB13_7: # %cond.store5 +; AVX-NEXT: vpextrw $3, %xmm0, 6(%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i16: ; AVX512F: # %bb.0: @@ -4998,82 +5022,43 @@ ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi) ; SSE4-NEXT: retq ; -; AVX1-LABEL: truncstore_v4i32_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; 
AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax -; AVX1-NEXT: testb $1, %al -; AVX1-NEXT: jne .LBB14_1 -; AVX1-NEXT: # %bb.2: # %else -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: jne .LBB14_3 -; AVX1-NEXT: .LBB14_4: # %else2 -; AVX1-NEXT: testb $4, %al -; AVX1-NEXT: jne .LBB14_5 -; AVX1-NEXT: .LBB14_6: # %else4 -; AVX1-NEXT: testb $8, %al -; AVX1-NEXT: jne .LBB14_7 -; AVX1-NEXT: .LBB14_8: # %else6 -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB14_1: # %cond.store -; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX1-NEXT: testb $2, %al -; AVX1-NEXT: je .LBB14_4 -; AVX1-NEXT: .LBB14_3: # %cond.store1 -; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX1-NEXT: testb $4, %al -; AVX1-NEXT: je .LBB14_6 -; AVX1-NEXT: .LBB14_5: # %cond.store3 -; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX1-NEXT: testb $8, %al -; AVX1-NEXT: je .LBB14_8 -; AVX1-NEXT: .LBB14_7: # %cond.store5 -; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: truncstore_v4i32_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [255,255,255,255] -; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax -; AVX2-NEXT: testb $1, %al -; AVX2-NEXT: jne .LBB14_1 -; AVX2-NEXT: # %bb.2: # %else -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: jne .LBB14_3 -; AVX2-NEXT: .LBB14_4: # %else2 -; AVX2-NEXT: testb $4, %al -; AVX2-NEXT: jne .LBB14_5 -; AVX2-NEXT: .LBB14_6: # %else4 -; AVX2-NEXT: testb $8, %al -; AVX2-NEXT: jne .LBB14_7 -; AVX2-NEXT: .LBB14_8: # %else6 -; AVX2-NEXT: retq -; AVX2-NEXT: .LBB14_1: # %cond.store -; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi) -; AVX2-NEXT: testb $2, %al -; AVX2-NEXT: je .LBB14_4 -; AVX2-NEXT: .LBB14_3: # %cond.store1 -; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi) -; AVX2-NEXT: testb $4, %al -; AVX2-NEXT: je .LBB14_6 -; AVX2-NEXT: .LBB14_5: # %cond.store3 -; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi) -; AVX2-NEXT: testb $8, %al -; AVX2-NEXT: je .LBB14_8 -; AVX2-NEXT: .LBB14_7: # %cond.store5 -; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: truncstore_v4i32_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vmovmskps %xmm1, %eax +; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: testb $1, %al +; AVX-NEXT: jne .LBB14_1 +; AVX-NEXT: # %bb.2: # %else +; AVX-NEXT: testb $2, %al +; AVX-NEXT: jne .LBB14_3 +; AVX-NEXT: .LBB14_4: # %else2 +; AVX-NEXT: testb $4, %al +; AVX-NEXT: jne .LBB14_5 +; AVX-NEXT: .LBB14_6: # %else4 +; AVX-NEXT: testb $8, %al +; AVX-NEXT: jne .LBB14_7 +; AVX-NEXT: .LBB14_8: # %else6 +; AVX-NEXT: retq +; AVX-NEXT: .LBB14_1: # %cond.store +; AVX-NEXT: vpextrb $0, %xmm0, (%rdi) +; AVX-NEXT: testb $2, %al +; AVX-NEXT: je .LBB14_4 +; AVX-NEXT: .LBB14_3: # %cond.store1 +; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi) +; AVX-NEXT: testb $4, %al +; AVX-NEXT: je .LBB14_6 +; AVX-NEXT: .LBB14_5: # %cond.store3 +; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi) +; AVX-NEXT: testb $8, %al +; AVX-NEXT: je .LBB14_8 +; AVX-NEXT: .LBB14_7: # %cond.store5 +; AVX-NEXT: vpextrb $3, 
%xmm0, 3(%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: truncstore_v4i32_v4i8: ; AVX512F: # %bb.0: @@ -5857,7 +5842,7 @@ ; AVX2-LABEL: truncstore_v32i16_v32i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpminuw %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpminuw %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 @@ -6097,7 +6082,7 @@ ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpminuw %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll --- a/llvm/test/CodeGen/X86/memset-nonzero.ll +++ b/llvm/test/CodeGen/X86/memset-nonzero.ll @@ -28,7 +28,7 @@ ; ; AVX-LABEL: memset_16_nonzero_bytes: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13] ; AVX-NEXT: vmovups %xmm0, (%rdi) ; AVX-NEXT: retq %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 16, i64 -1) @@ -54,7 +54,7 @@ ; ; AVX-LABEL: memset_32_nonzero_bytes: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13] ; AVX-NEXT: vmovups %ymm0, (%rdi) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -87,7 +87,7 @@ ; ; AVX1-LABEL: memset_64_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) ; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper @@ -95,7 +95,7 @@ ; ; AVX2-LABEL: memset_64_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13] ; AVX2-NEXT: vmovups %ymm0, 32(%rdi) ; AVX2-NEXT: vmovups %ymm0, (%rdi) ; AVX2-NEXT: vzeroupper @@ -156,7 +156,7 @@ ; ; AVX1-LABEL: memset_128_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = 
[1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13] ; AVX1-NEXT: vmovups %ymm0, 96(%rdi) ; AVX1-NEXT: vmovups %ymm0, 64(%rdi) ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) @@ -166,7 +166,7 @@ ; ; AVX2-LABEL: memset_128_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13] ; AVX2-NEXT: vmovups %ymm0, 96(%rdi) ; AVX2-NEXT: vmovups %ymm0, 64(%rdi) ; AVX2-NEXT: vmovups %ymm0, 32(%rdi) @@ -223,7 +223,7 @@ ; ; AVX1-LABEL: memset_256_nonzero_bytes: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13] ; AVX1-NEXT: vmovups %ymm0, 224(%rdi) ; AVX1-NEXT: vmovups %ymm0, 192(%rdi) ; AVX1-NEXT: vmovups %ymm0, 160(%rdi) @@ -237,7 +237,7 @@ ; ; AVX2-LABEL: memset_256_nonzero_bytes: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13,1.51136617E-13] ; AVX2-NEXT: vmovups %ymm0, 224(%rdi) ; AVX2-NEXT: vmovups %ymm0, 192(%rdi) ; AVX2-NEXT: vmovups %ymm0, 160(%rdi) diff --git a/llvm/test/CodeGen/X86/merge-store-constants.ll b/llvm/test/CodeGen/X86/merge-store-constants.ll --- a/llvm/test/CodeGen/X86/merge-store-constants.ll +++ b/llvm/test/CodeGen/X86/merge-store-constants.ll @@ -58,14 +58,14 @@ ; X32-LABEL: big_nonzero_32_bytes_splat: ; X32: # %bb.0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] +; X32-NEXT: vbroadcastss {{.*#+}} ymm0 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] ; X32-NEXT: vmovups %ymm0, (%eax) ; X32-NEXT: vzeroupper ; X32-NEXT: retl ; ; X64-LABEL: big_nonzero_32_bytes_splat: ; X64: # %bb.0: -; X64-NEXT: vmovaps {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42] +; X64-NEXT: vbroadcastss {{.*#+}} ymm0 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] ; X64-NEXT: vmovups %ymm0, (%rdi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -63,64 +63,28 @@ ; SSE41-NEXT: paddd %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: vec128_i32_signed_reg_reg: -; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: vec128_i32_signed_reg_reg: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpbroadcastd 
{{.*#+}} xmm3 = [1,1,1,1] -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; XOP-FALLBACK-LABEL: vec128_i32_signed_reg_reg: -; XOP-FALLBACK: # %bb.0: -; XOP-FALLBACK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; XOP-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOP-FALLBACK-NEXT: retq -; -; XOPAVX1-LABEL: vec128_i32_signed_reg_reg: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; XOPAVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: vec128_i32_signed_reg_reg: +; AVX: # %bb.0: +; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm3 +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: vec128_i32_signed_reg_reg: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: vec128_i32_signed_reg_reg: +; XOP: # %bb.0: +; XOP-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOP-NEXT: vpminsd %xmm1, %xmm0, %xmm3 +; XOP-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vpsrld $1, %xmm1, %xmm1 +; XOP-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 +; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i32_signed_reg_reg: ; AVX512F: # %bb.0: @@ -224,68 +188,30 @@ ; SSE41-NEXT: paddd %xmm4, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: vec128_i32_unsigned_reg_reg: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: vec128_i32_unsigned_reg_reg: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [1,1,1,1] -; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 -; 
AVX2-NEXT: vpsubd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX2-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; XOP-FALLBACK-LABEL: vec128_i32_unsigned_reg_reg: -; XOP-FALLBACK: # %bb.0: -; XOP-FALLBACK-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2 -; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpminud %xmm1, %xmm0, %xmm3 -; XOP-FALLBACK-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 -; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOP-FALLBACK-NEXT: retq -; -; XOPAVX1-LABEL: vec128_i32_unsigned_reg_reg: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2 -; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vpminud %xmm1, %xmm0, %xmm3 -; XOPAVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: vec128_i32_unsigned_reg_reg: +; AVX: # %bb.0: +; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm3 +; AVX-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX-NEXT: vpxor %xmm4, %xmm3, %xmm3 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 +; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX-NEXT: vpmulld %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: vec128_i32_unsigned_reg_reg: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpminud %xmm1, %xmm0, %xmm3 -; XOPAVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: vec128_i32_unsigned_reg_reg: +; XOP: # %bb.0: +; XOP-NEXT: vpcomgtud %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOP-NEXT: vpminud %xmm1, %xmm0, %xmm3 +; XOP-NEXT: vpmaxud %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vpsrld $1, %xmm1, %xmm1 +; XOP-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 +; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i32_unsigned_reg_reg: ; AVX512F: # %bb.0: @@ -388,69 +314,30 @@ ; SSE41-NEXT: paddd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: vec128_i32_signed_mem_reg: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpminsd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: vec128_i32_signed_mem_reg: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpminsd %xmm0, %xmm1, %xmm3 -; AVX2-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX2-NEXT: vpmulld %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; 
AVX2-NEXT: retq -; -; XOP-FALLBACK-LABEL: vec128_i32_signed_mem_reg: -; XOP-FALLBACK: # %bb.0: -; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-FALLBACK-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2 -; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpminsd %xmm0, %xmm1, %xmm3 -; XOP-FALLBACK-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 -; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpsrld $1, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0 -; XOP-FALLBACK-NEXT: retq -; -; XOPAVX1-LABEL: vec128_i32_signed_mem_reg: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1 -; XOPAVX1-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2 -; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vpminsd %xmm0, %xmm1, %xmm3 -; XOPAVX1-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 -; XOPAVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: vec128_i32_signed_mem_reg: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm2 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpminsd %xmm0, %xmm1, %xmm3 +; AVX-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsrld $1, %xmm0, %xmm0 +; AVX-NEXT: vpmulld %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: vec128_i32_signed_mem_reg: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1 -; XOPAVX2-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpminsd %xmm0, %xmm1, %xmm3 -; XOPAVX2-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 -; XOPAVX2-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; XOPAVX2-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: vec128_i32_signed_mem_reg: +; XOP: # %bb.0: +; XOP-NEXT: vmovdqa (%rdi), %xmm1 +; XOP-NEXT: vpcomgtd %xmm0, %xmm1, %xmm2 +; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOP-NEXT: vpminsd %xmm0, %xmm1, %xmm3 +; XOP-NEXT: vpmaxsd %xmm0, %xmm1, %xmm0 +; XOP-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; XOP-NEXT: vpsrld $1, %xmm0, %xmm0 +; XOP-NEXT: vpmacsdd %xmm1, %xmm2, %xmm0, %xmm0 +; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i32_signed_mem_reg: ; AVX512F: # %bb.0: @@ -553,69 +440,30 @@ ; SSE41-NEXT: paddd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: vec128_i32_signed_reg_mem: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: vec128_i32_signed_reg_mem: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; XOP-FALLBACK-LABEL: 
vec128_i32_signed_reg_mem: -; XOP-FALLBACK: # %bb.0: -; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 -; XOP-FALLBACK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; XOP-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOP-FALLBACK-NEXT: retq -; -; XOPAVX1-LABEL: vec128_i32_signed_reg_mem: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm1 -; XOPAVX1-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; XOPAVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: vec128_i32_signed_reg_mem: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm3 +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: vec128_i32_signed_reg_mem: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm1 -; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: vec128_i32_signed_reg_mem: +; XOP: # %bb.0: +; XOP-NEXT: vmovdqa (%rdi), %xmm1 +; XOP-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOP-NEXT: vpminsd %xmm1, %xmm0, %xmm3 +; XOP-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vpsrld $1, %xmm1, %xmm1 +; XOP-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 +; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i32_signed_reg_mem: ; AVX512F: # %bb.0: @@ -720,74 +568,32 @@ ; SSE41-NEXT: paddd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: vec128_i32_signed_mem_mem: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: vec128_i32_signed_mem_mem: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa (%rsi), %xmm1 -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; 
XOP-FALLBACK-LABEL: vec128_i32_signed_mem_mem: -; XOP-FALLBACK: # %bb.0: -; XOP-FALLBACK-NEXT: vmovdqa (%rdi), %xmm0 -; XOP-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 -; XOP-FALLBACK-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOP-FALLBACK-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOP-FALLBACK-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; XOP-FALLBACK-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; XOP-FALLBACK-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOP-FALLBACK-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOP-FALLBACK-NEXT: retq -; -; XOPAVX1-LABEL: vec128_i32_signed_mem_mem: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa (%rdi), %xmm0 -; XOPAVX1-NEXT: vmovdqa (%rsi), %xmm1 -; XOPAVX1-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOPAVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; XOPAVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; XOPAVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; XOPAVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOPAVX1-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOPAVX1-NEXT: retq +; AVX-LABEL: vec128_i32_signed_mem_mem: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa (%rsi), %xmm1 +; AVX-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm2 +; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm3 +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 +; AVX-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq ; -; XOPAVX2-LABEL: vec128_i32_signed_mem_mem: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa (%rdi), %xmm0 -; XOPAVX2-NEXT: vmovdqa (%rsi), %xmm1 -; XOPAVX2-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 -; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,1,1,1] -; XOPAVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm3 -; XOPAVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 -; XOPAVX2-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpsrld $1, %xmm1, %xmm1 -; XOPAVX2-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 -; XOPAVX2-NEXT: retq +; XOP-LABEL: vec128_i32_signed_mem_mem: +; XOP: # %bb.0: +; XOP-NEXT: vmovdqa (%rdi), %xmm0 +; XOP-NEXT: vmovdqa (%rsi), %xmm1 +; XOP-NEXT: vpcomgtd %xmm1, %xmm0, %xmm2 +; XOP-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; XOP-NEXT: vpminsd %xmm1, %xmm0, %xmm3 +; XOP-NEXT: vpmaxsd %xmm1, %xmm0, %xmm1 +; XOP-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; XOP-NEXT: vpsrld $1, %xmm1, %xmm1 +; XOP-NEXT: vpmacsdd %xmm0, %xmm2, %xmm1, %xmm0 +; XOP-NEXT: retq ; ; AVX512F-LABEL: vec128_i32_signed_mem_mem: ; AVX512F: # %bb.0: @@ -975,7 +781,7 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1011,7 +817,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1115,27 +921,49 @@ ; SSE41-NEXT: paddq %xmm3, 
%xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: vec128_i64_unsigned_reg_reg: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm4 -; AVX-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; AVX-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq $1, %xmm1, %xmm2 -; AVX-NEXT: vpsrlq $33, %xmm1, %xmm1 -; AVX-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 -; AVX-NEXT: vpaddq %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: vec128_i64_unsigned_reg_reg: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $1, %xmm1, %xmm2 +; AVX1-NEXT: vpsrlq $33, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 +; AVX1-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; AVX1-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vec128_i64_unsigned_reg_reg: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm4 +; AVX2-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vblendvpd %xmm2, %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $1, %xmm1, %xmm2 +; AVX2-NEXT: vpsrlq $33, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq $32, %xmm3, %xmm4 +; AVX2-NEXT: vpmuludq %xmm4, %xmm2, %xmm4 +; AVX2-NEXT: vpaddq %xmm1, %xmm4, %xmm1 +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 +; AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: vec128_i64_unsigned_reg_reg: ; XOP: # %bb.0: @@ -1162,7 +990,7 @@ ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1198,7 +1026,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, 
%zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1 @@ -1352,7 +1180,7 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -1389,7 +1217,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0 @@ -1542,7 +1370,7 @@ ; AVX512F-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1579,7 +1407,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1735,7 +1563,7 @@ ; AVX512F-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1773,7 +1601,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1 @@ -1877,7 +1705,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 @@ -2002,7 +1830,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 ; 
AVX512BW-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 @@ -2110,7 +1938,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm0, %xmm1, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm0, %xmm1, %xmm0 @@ -2218,7 +2046,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 @@ -2331,7 +2159,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm1 @@ -2565,7 +2393,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 @@ -2798,7 +2626,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 @@ -3040,7 +2868,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm0, %xmm1, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm0, %xmm1, %xmm0 @@ -3280,7 +3108,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 ; 
AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 @@ -3529,7 +3357,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %xmm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %xmm1, %xmm0, %xmm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %xmm1, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -425,8 +425,7 @@ ; AVX2-LABEL: vec256_i64_signed_reg_reg: ; AVX2: # %bb.0: ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 @@ -598,8 +597,7 @@ ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 @@ -767,8 +765,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm4 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm0, %ymm0 @@ -937,8 +934,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 @@ -1109,8 +1105,7 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rsi), %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1] -; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3 ; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm4 ; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 @@ -1333,7 +1328,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 @@ -1461,7 +1456,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, 
%ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 @@ -1589,7 +1584,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm0, %ymm1, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm0, %ymm1, %ymm0 @@ -1717,7 +1712,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 @@ -1850,7 +1845,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsw %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsw %ymm1, %ymm0, %ymm1 @@ -1949,7 +1944,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2009,7 +2004,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2037,7 +2032,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 @@ -2136,7 +2131,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2197,7 +2192,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2226,7 +2221,7 @@ ; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-FALLBACK-NEXT: vpcmpnleub %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 @@ -2324,7 +2319,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; 
AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2386,7 +2381,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2415,7 +2410,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm0, %zmm1, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm0, %ymm1, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm0, %ymm1, %ymm0 @@ -2513,7 +2508,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2575,7 +2570,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2604,7 +2599,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb 
%zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 @@ -2704,7 +2699,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2768,7 +2763,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -2798,7 +2793,7 @@ ; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512BW-FALLBACK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1 ; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512BW-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512BW-FALLBACK-NEXT: vpbroadcastb {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512BW-FALLBACK-NEXT: vmovdqu8 %zmm2, %zmm3 {%k1} ; AVX512BW-FALLBACK-NEXT: vpminsb %ymm1, %ymm0, %ymm2 ; AVX512BW-FALLBACK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm1 diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -1670,18 +1670,11 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: allzeros_v4i32_and1: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: retq -; -; AVX2-LABEL: allzeros_v4i32_and1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vptest %xmm1, %xmm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: retq +; AVX1OR2-LABEL: allzeros_v4i32_and1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: allzeros_v4i32_and1: ; 
AVX512: # %bb.0: @@ -1769,20 +1762,12 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: allzeros_v8i32_and1: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: allzeros_v8i32_and1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: allzeros_v8i32_and1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: allzeros_v8i32_and1: ; AVX512: # %bb.0: @@ -1889,8 +1874,7 @@ ; AVX2-LABEL: allzeros_v16i32_and1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1929,7 +1913,7 @@ ; KNL-LABEL: allones_v2i64_and1: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1] +; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] ; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $3, %al @@ -2064,20 +2048,12 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: allzeros_v4i64_and1: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: allzeros_v4i64_and1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: allzeros_v4i64_and1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: allzeros_v4i64_and1: ; AVX512: # %bb.0: @@ -2192,8 +2168,7 @@ ; AVX2-LABEL: allzeros_v8i64_and1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -2816,18 +2791,11 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: allzeros_v4i32_and4: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: retq -; -; AVX2-LABEL: allzeros_v4i32_and4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; AVX2-NEXT: vptest %xmm1, %xmm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: retq +; AVX1OR2-LABEL: allzeros_v4i32_and4: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: allzeros_v4i32_and4: ; AVX512: # %bb.0: @@ -2915,20 +2883,12 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: allzeros_v8i32_and4: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: allzeros_v8i32_and4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: 
allzeros_v8i32_and4: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: allzeros_v8i32_and4: ; AVX512: # %bb.0: @@ -3035,8 +2995,7 @@ ; AVX2-LABEL: allzeros_v16i32_and4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -3075,7 +3034,7 @@ ; KNL-LABEL: allones_v2i64_and4: ; KNL: # %bb.0: ; KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; KNL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] ; KNL-NEXT: vptestnmq %zmm1, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb $3, %al @@ -3210,20 +3169,12 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: allzeros_v4i64_and4: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: allzeros_v4i64_and4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: allzeros_v4i64_and4: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: allzeros_v4i64_and4: ; AVX512: # %bb.0: @@ -3338,8 +3289,7 @@ ; AVX2-LABEL: allzeros_v8i64_and4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1504,14 +1504,13 @@ ; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> @@ -1534,14 +1533,13 @@ ; AVX2-FAST-ALL-NEXT: vmovups (%rdi), %ymm0 ; AVX2-FAST-ALL-NEXT: vmovups 32(%rdi), %ymm1 ; AVX2-FAST-ALL-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = 
<0,3,6,1,4,7,u,u> ; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> @@ -1564,14 +1562,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> @@ -1779,8 +1776,7 @@ ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm4 ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll --- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -111,13 +111,10 @@ ; ; AVX2-LABEL: p4_vector_urem_by_const__splat: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, ; clearly a power-of-two or zero @@ -211,13 +208,12 @@ ; ; AVX2-LABEL: p6_vector_urem_by_const__nonsplat_undef0: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = 
[128,128,128,128] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 +; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq %t0 = and <4 x i32> %x, @@ -263,8 +259,7 @@ ; ; AVX2-LABEL: p7_vector_urem_by_const__nonsplat_undef2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] ; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 @@ -326,8 +321,7 @@ ; ; AVX2-LABEL: p8_vector_urem_by_const__nonsplat_undef3: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [128,128,128,128] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2863311531,2863311531,2863311531,2863311531] ; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 @@ -335,8 +329,7 @@ ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -204,6 +204,7 @@ ; CHECK-NEXT: X86 Atom pad short functions ; CHECK-NEXT: X86 LEA Fixup ; CHECK-NEXT: X86 Fixup Inst Tuning +; CHECK-NEXT: X86 Fixup Vector Constants ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible ; CHECK-NEXT: X86 Discriminate Memory Operands ; CHECK-NEXT: X86 Insert Cache Prefetches diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -170,19 +170,17 @@ ; ; X86-AVX1-LABEL: trunc_ashr_v4i64_demandedelts: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm1 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X86-AVX1-NEXT: vpsllq $63, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpsllq $63, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0 -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,1] -; X86-AVX1-NEXT: # xmm2 = mem[0,0] +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,0,0] ; X86-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 ; X86-AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 -; X86-AVX1-NEXT: vpaddq %xmm3, 
%xmm1, %xmm1 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X86-AVX1-NEXT: vpsubq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 @@ -191,12 +189,10 @@ ; ; X86-AVX2-LABEL: trunc_ashr_v4i64_demandedelts: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0] -; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [63,0,0,0,63,0,0,0] ; X86-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 -; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,0,0,2147483648,1,0,0,2147483648] -; X86-AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,0,2147483648,1,0,0,2147483648] ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] @@ -242,10 +238,8 @@ ; ; X64-AVX2-LABEL: trunc_ashr_v4i64_demandedelts: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; X64-AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,9223372036854775808,1,9223372036854775808] -; X64-AVX2-NEXT: # ymm1 = mem[0,1,0,1] +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,9223372036854775808,1,9223372036854775808] ; X64-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -487,7 +487,7 @@ ; ; AVX2-LABEL: test14: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -527,7 +527,7 @@ ; ; AVX2-LABEL: test15: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129] ; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -567,7 +567,7 @@ ; ; AVX2-LABEL: test16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254] ; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -614,7 +614,7 @@ ; ; AVX1-LABEL: test17: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = 
[128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-1.18010406E-38,-1.18010406E-38,-1.18010406E-38,-1.18010406E-38,-1.18010406E-38,-1.18010406E-38,-1.18010406E-38,-1.18010406E-38] ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3 ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 @@ -642,7 +642,7 @@ ; ; AVX2-LABEL: test17: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1 @@ -695,7 +695,7 @@ ; ; AVX2-LABEL: test18: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX2-NEXT: vpaddusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1240,7 +1240,7 @@ ; ; AVX2-LABEL: test32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] ; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1280,7 +1280,7 @@ ; ; AVX2-LABEL: test33: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769,32769] ; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1320,7 +1320,7 @@ ; ; AVX2-LABEL: test34: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534,65534] ; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -1423,7 +1423,7 @@ ; ; AVX1-LABEL: test35: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-4.59177481E-41,-4.59177481E-41,-4.59177481E-41,-4.59177481E-41,-4.59177481E-41,-4.59177481E-41,-4.59177481E-41,-4.59177481E-41] ; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm3 ; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 @@ -1451,7 +1451,7 @@ ; ; AVX2-LABEL: test35: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768] ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %ymm2, 
%ymm1, %ymm2 ; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1 @@ -1504,7 +1504,7 @@ ; ; AVX2-LABEL: test36: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX2-NEXT: vpaddusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpaddusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -98,11 +98,16 @@ ; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: mul_v4i32c: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: mul_v4i32c: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v4i32c: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [117,117,117,117] +; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq entry: %A = mul <4 x i32> %i, < i32 117, i32 117, i32 117, i32 117 > ret <4 x i32> %A @@ -122,7 +127,7 @@ ; ; AVX-LABEL: mul_v2i64c: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [117,117] +; AVX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [117,117] ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 @@ -417,9 +422,9 @@ ; AVX2-LABEL: mul_v32i8c: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 @@ -430,9 +435,9 @@ ; AVX512F-LABEL: mul_v32i8c: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX512F-NEXT: vpmullw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw %ymm2, %ymm0, %ymm0 @@ -493,11 +498,16 @@ ; SSE41-NEXT: pmulld %xmm2, %xmm1 ; SSE41-NEXT: retq ; -; AVX-LABEL: mul_v8i32c: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117] -; AVX-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq 
+; AVX2-LABEL: mul_v8i32c: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: mul_v8i32c: +; AVX512: # %bb.0: # %entry +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [117,117,117,117,117,117,117,117] +; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq entry: %A = mul <8 x i32> %i, < i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117, i32 117 > ret <8 x i32> %A @@ -593,7 +603,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -607,7 +617,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -799,9 +809,9 @@ ; AVX2-LABEL: mul_v64i8c: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX2-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw %ymm3, %ymm0, %ymm0 @@ -820,9 +830,9 @@ ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [117,117,117,117,117,117,117,117,117,117,117,117,117,117,117,117] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, 
%ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw %ymm3, %ymm1, %ymm1 @@ -955,7 +965,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] @@ -980,7 +990,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw %ymm3, %ymm5, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -372,7 +372,7 @@ ; AVX512F-LABEL: and_mulhuw_v16i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767,32767] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpand %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/pr30290.ll b/llvm/test/CodeGen/X86/pr30290.ll --- a/llvm/test/CodeGen/X86/pr30290.ll +++ b/llvm/test/CodeGen/X86/pr30290.ll @@ -20,7 +20,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; CHECK-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 diff --git a/llvm/test/CodeGen/X86/pr32368.ll b/llvm/test/CodeGen/X86/pr32368.ll --- a/llvm/test/CodeGen/X86/pr32368.ll +++ b/llvm/test/CodeGen/X86/pr32368.ll @@ -21,11 +21,9 @@ ; ; AVX2-LABEL: PR32368_128: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = 
[4294967004,4294967004,4294967004,4294967004] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vaddps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [291,291,291,291] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: PR32368_128: @@ -68,11 +66,9 @@ ; ; AVX2-LABEL: PR32368_256: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] -; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vaddps %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [291,291,291,291,291,291,291,291] -; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: PR32368_256: @@ -114,24 +110,24 @@ ; ; AVX1-LABEL: PR32368_512: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm0, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR32368_512: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004,4294967004] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm0, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [291,291,291,291,291,291,291,291] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43,4.07777853E-43] ; AVX2-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr38639.ll b/llvm/test/CodeGen/X86/pr38639.ll --- a/llvm/test/CodeGen/X86/pr38639.ll +++ b/llvm/test/CodeGen/X86/pr38639.ll @@ -4,11 +4,12 @@ define <8 x double> @test(<4 x double> %a, <4 x double> %b) { ; CHECK-LABEL: test: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1,8.2071743224100002E-1] ; CHECK-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; CHECK-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm2[1],ymm1[3],ymm2[3] -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1] +; CHECK-NEXT: vmovddup {{.*#+}} xmm2 = [8.2071743224100002E-1,8.2071743224100002E-1] +; CHECK-NEXT: # xmm2 = mem[0,0] ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; CHECK-NEXT: retq %1 = shufflevector <4 x double> %a, <4 x double> , <8 x i32> diff --git 
a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll --- a/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-popcnt.ll @@ -84,17 +84,41 @@ } define <32 x i8> @testv32i8(<32 x i8> %in) { -; CHECK-LABEL: testv32i8: -; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm2 -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; CHECK-NEXT: vpshufb %ymm2, %ymm3, %ymm2 -; CHECK-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-NEXT: vpand %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpshufb %ymm0, %ymm3, %ymm0 -; CHECK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: retq +; AVX256-LABEL: testv32i8: +; AVX256: # %bb.0: +; AVX256-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX256-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX256-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX256-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX256-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX256-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX256-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX256-NEXT: retq +; +; AVX512VL-LABEL: testv32i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512F-LABEL: testv32i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512F-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512F-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) ret <32 x i8> %out } @@ -103,3 +127,5 @@ declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -793,8 +793,7 @@ ; ; AVX2-LABEL: test13: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 @@ -928,7 +927,7 @@ ; AVX1-NEXT: vpxor %xmm7, %xmm3, %xmm3 ; AVX1-NEXT: vpackssdw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpacksswb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 @@ -1059,8 +1058,7 @@ ; ; AVX2-LABEL: test15: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 @@ -1582,8 +1580,7 @@ ; ; AVX2-LABEL: psubus_8i32_max: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 @@ -1726,7 +1723,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -1751,7 +1749,7 @@ ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [65535,65535,65535,65535] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX2-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 @@ -1951,8 +1949,7 @@ ; ; AVX2-LABEL: psubus_i16_i32_max_swapped: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 @@ -2046,8 +2043,7 @@ ; ; AVX2-LABEL: psubus_i16_i32_min: ; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = 
[65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 @@ -2424,7 +2420,7 @@ ; ; AVX2-LABEL: test27: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154,154] ; AVX2-NEXT: vpsubusb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsubusb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -2635,8 +2631,7 @@ ; ; AVX2-LABEL: test32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 @@ -2783,7 +2778,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -2811,7 +2807,7 @@ ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 @@ -2829,7 +2825,7 @@ ; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1 @@ -2848,7 +2844,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-FAST-PERLANE-NEXT: vblendvpd 
%ymm4, %ymm2, %ymm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 @@ -3007,7 +3003,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm7, %xmm3 ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -3031,13 +3028,12 @@ ; ; AVX2-SLOW-LABEL: test34: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 @@ -3051,13 +3047,12 @@ ; ; AVX2-FAST-ALL-LABEL: test34: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-FAST-ALL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-FAST-ALL-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-FAST-ALL-NEXT: vblendvpd %ymm4, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm4, %ymm1 @@ -3072,13 +3067,12 @@ ; ; AVX2-FAST-PERLANE-LABEL: test34: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] -; AVX2-FAST-PERLANE-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm2, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = 
[2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm4, %ymm2, %ymm6, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll --- a/llvm/test/CodeGen/X86/recip-fastmath.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath.ll @@ -333,53 +333,11 @@ ; SSE-NEXT: movaps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-RECIP-LABEL: v4f32_no_estimate: -; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; AVX-RECIP-NEXT: retq -; -; FMA-RECIP-LABEL: v4f32_no_estimate: -; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; FMA-RECIP-NEXT: retq -; -; BDVER2-LABEL: v4f32_no_estimate: -; BDVER2: # %bb.0: -; BDVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; BDVER2-NEXT: retq -; -; BTVER2-LABEL: v4f32_no_estimate: -; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BTVER2-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; BTVER2-NEXT: retq -; -; SANDY-LABEL: v4f32_no_estimate: -; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; SANDY-NEXT: retq -; -; HASWELL-LABEL: v4f32_no_estimate: -; HASWELL: # %bb.0: -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; HASWELL-NEXT: retq -; -; HASWELL-NO-FMA-LABEL: v4f32_no_estimate: -; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; HASWELL-NO-FMA-NEXT: retq -; -; AVX512-LABEL: v4f32_no_estimate: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: v4f32_no_estimate: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %div = fdiv fast <4 x float> , %x ret <4 x float> %div } @@ -400,7 +358,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -422,7 +380,7 @@ ; ; BTVER2-LABEL: v4f32_one_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -434,7 +392,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -443,8 +401,7 @@ ; HASWELL-LABEL: 
v4f32_one_step: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2 +; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem ; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 ; HASWELL-NEXT: retq ; @@ -585,7 +542,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -598,7 +555,7 @@ ; FMA-RECIP-LABEL: v4f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1 @@ -609,7 +566,7 @@ ; BDVER2-LABEL: v4f32_two_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %xmm0, %xmm1 -; BDVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2 ; BDVER2-NEXT: vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1 ; BDVER2-NEXT: vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2 @@ -618,7 +575,7 @@ ; ; BTVER2-LABEL: v4f32_two_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 ; BTVER2-NEXT: vsubps %xmm2, %xmm3, %xmm2 @@ -634,7 +591,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -694,53 +651,11 @@ ; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX-RECIP-LABEL: v8f32_no_estimate: -; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; AVX-RECIP-NEXT: retq -; -; FMA-RECIP-LABEL: v8f32_no_estimate: -; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; FMA-RECIP-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; FMA-RECIP-NEXT: retq -; -; BDVER2-LABEL: v8f32_no_estimate: -; BDVER2: # %bb.0: -; BDVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BDVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; BDVER2-NEXT: retq -; -; BTVER2-LABEL: v8f32_no_estimate: -; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; BTVER2-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; BTVER2-NEXT: retq -; -; SANDY-LABEL: v8f32_no_estimate: -; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; SANDY-NEXT: vdivps %ymm0, %ymm1, %ymm0 
-; SANDY-NEXT: retq -; -; HASWELL-LABEL: v8f32_no_estimate: -; HASWELL: # %bb.0: -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; HASWELL-NEXT: retq -; -; HASWELL-NO-FMA-LABEL: v8f32_no_estimate: -; HASWELL-NO-FMA: # %bb.0: -; HASWELL-NO-FMA-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NO-FMA-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; HASWELL-NO-FMA-NEXT: retq -; -; AVX512-LABEL: v8f32_no_estimate: -; AVX512: # %bb.0: -; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: v8f32_no_estimate: +; AVX: # %bb.0: +; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq %div = fdiv fast <8 x float> , %x ret <8 x float> %div } @@ -768,7 +683,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -790,7 +705,7 @@ ; ; BTVER2-LABEL: v8f32_one_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -802,7 +717,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -811,8 +726,7 @@ ; HASWELL-LABEL: v8f32_one_step: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2 +; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem ; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1 ; HASWELL-NEXT: retq ; @@ -879,7 +793,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -892,7 +806,7 @@ ; FMA-RECIP-LABEL: v8f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps 
%ymm1, %ymm3 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm3 = (ymm0 * ymm3) - ymm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm1) + ymm1 @@ -903,7 +817,7 @@ ; BDVER2-LABEL: v8f32_two_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm1 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm3 = (ymm0 * ymm1) - ymm2 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm1 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2 @@ -912,7 +826,7 @@ ; ; BTVER2-LABEL: v8f32_two_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 ; BTVER2-NEXT: vsubps %ymm2, %ymm3, %ymm2 @@ -928,7 +842,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -996,35 +910,35 @@ ; ; AVX-RECIP-LABEL: v16f32_no_estimate: ; AVX-RECIP: # %bb.0: -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; AVX-RECIP-NEXT: retq ; ; FMA-RECIP-LABEL: v16f32_no_estimate: ; FMA-RECIP: # %bb.0: -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; FMA-RECIP-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; FMA-RECIP-NEXT: retq ; ; BDVER2-LABEL: v16f32_no_estimate: ; BDVER2: # %bb.0: -; BDVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; BDVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; BDVER2-NEXT: retq ; ; BTVER2-LABEL: v16f32_no_estimate: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; BTVER2-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; BTVER2-NEXT: retq ; ; SANDY-LABEL: v16f32_no_estimate: ; SANDY: # %bb.0: -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vdivps %ymm1, %ymm2, %ymm1 ; SANDY-NEXT: retq @@ -1089,7 +1003,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1103,7 +1017,7 @@ ; FMA-RECIP-LABEL: v16f32_one_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 @@ -1114,7 +1028,7 @@ ; BDVER2-LABEL: v16f32_one_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vrcpps %ymm1, %ymm4 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3 @@ -1124,7 +1038,7 @@ ; ; BTVER2-LABEL: v16f32_one_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vrcpps %ymm1, %ymm4 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 @@ -1141,7 +1055,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vrcpps %ymm1, %ymm4 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 @@ -1249,7 +1163,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 @@ -1271,7 +1185,7 @@ ; FMA-RECIP-LABEL: v16f32_two_step: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 @@ -1288,7 +1202,7 @@ ; BDVER2-LABEL: v16f32_two_step: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 @@ 
-1302,7 +1216,7 @@ ; ; BTVER2-LABEL: v16f32_two_step: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 @@ -1327,7 +1241,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll --- a/llvm/test/CodeGen/X86/recip-fastmath2.ll +++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll @@ -476,7 +476,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; AVX-RECIP-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; AVX-RECIP-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -504,7 +504,7 @@ ; ; BTVER2-LABEL: v4f32_one_step_2_divs: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; BTVER2-NEXT: vsubps %xmm0, %xmm2, %xmm0 @@ -518,7 +518,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; SANDY-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm0, %xmm2, %xmm0 ; SANDY-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; SANDY-NEXT: vaddps %xmm0, %xmm1, %xmm0 @@ -529,11 +529,10 @@ ; HASWELL-LABEL: v4f32_one_step_2_divs: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %xmm0, %xmm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2 -; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 -; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0 -; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm0 +; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem +; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1 +; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs: @@ -595,7 +594,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %xmm0, %xmm1 ; AVX-RECIP-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; AVX-RECIP-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; AVX-RECIP-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -610,7 +609,7 @@ ; FMA-RECIP-LABEL: v4f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %xmm0, %xmm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} 
xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] @@ -632,7 +631,7 @@ ; ; BTVER2-LABEL: v4f32_two_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %xmm0, %xmm1 ; BTVER2-NEXT: vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0] ; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm2 @@ -650,7 +649,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %xmm0, %xmm1 ; SANDY-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; SANDY-NEXT: vmovaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %xmm2, %xmm3, %xmm2 ; SANDY-NEXT: vmulps %xmm2, %xmm1, %xmm2 ; SANDY-NEXT: vaddps %xmm2, %xmm1, %xmm1 @@ -838,7 +837,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -866,7 +865,7 @@ ; ; BTVER2-LABEL: v8f32_one_step_2_divs: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm2, %ymm0 @@ -880,7 +879,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm1, %ymm0 @@ -891,11 +890,10 @@ ; HASWELL-LABEL: v8f32_one_step_2_divs: ; HASWELL: # %bb.0: ; HASWELL-NEXT: vrcpps %ymm0, %ymm1 -; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2 -; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 -; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0 -; HASWELL-NEXT: vmulps %ymm2, %ymm0, %ymm0 +; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem +; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1 +; HASWELL-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 +; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0 ; HASWELL-NEXT: retq ; ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs: @@ -972,7 +970,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm1 ; AVX-RECIP-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX-RECIP-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -987,7 
+985,7 @@ ; FMA-RECIP-LABEL: v8f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm1 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1 ; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] @@ -1009,7 +1007,7 @@ ; ; BTVER2-LABEL: v8f32_two_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm1 ; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] ; BTVER2-NEXT: vmulps %ymm1, %ymm0, %ymm2 @@ -1027,7 +1025,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm1 ; SANDY-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm2, %ymm3, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; SANDY-NEXT: vaddps %ymm2, %ymm1, %ymm1 @@ -1327,7 +1325,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; AVX-RECIP-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; AVX-RECIP-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1345,7 +1343,7 @@ ; FMA-RECIP-LABEL: v16f32_one_step_2_divs: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2 ; FMA-RECIP-NEXT: vrcpps %ymm1, %ymm2 @@ -1360,7 +1358,7 @@ ; BDVER2-LABEL: v16f32_one_step_2_divs: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2 ; BDVER2-NEXT: vrcpps %ymm1, %ymm2 @@ -1374,7 +1372,7 @@ ; ; BTVER2-LABEL: v16f32_one_step_2_divs: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; BTVER2-NEXT: vsubps %ymm0, %ymm3, %ymm0 @@ -1395,7 +1393,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; SANDY-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm3 = 
[1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm0, %ymm3, %ymm0 ; SANDY-NEXT: vmulps %ymm0, %ymm2, %ymm0 ; SANDY-NEXT: vaddps %ymm0, %ymm2, %ymm0 @@ -1526,7 +1524,7 @@ ; AVX-RECIP: # %bb.0: ; AVX-RECIP-NEXT: vrcpps %ymm0, %ymm2 ; AVX-RECIP-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; AVX-RECIP-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-RECIP-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-RECIP-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; AVX-RECIP-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; AVX-RECIP-NEXT: vaddps %ymm3, %ymm2, %ymm2 @@ -1552,7 +1550,7 @@ ; FMA-RECIP-LABEL: v16f32_two_step2: ; FMA-RECIP: # %bb.0: ; FMA-RECIP-NEXT: vrcpps %ymm0, %ymm2 -; FMA-RECIP-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; FMA-RECIP-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; FMA-RECIP-NEXT: vmovaps %ymm2, %ymm4 ; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3 ; FMA-RECIP-NEXT: vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2 @@ -1572,7 +1570,7 @@ ; BDVER2-LABEL: v16f32_two_step2: ; BDVER2: # %bb.0: ; BDVER2-NEXT: vrcpps %ymm0, %ymm2 -; BDVER2-NEXT: vmovaps {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BDVER2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BDVER2-NEXT: vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3 ; BDVER2-NEXT: vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2 ; BDVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0] @@ -1590,7 +1588,7 @@ ; ; BTVER2-LABEL: v16f32_two_step2: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; BTVER2-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; BTVER2-NEXT: vrcpps %ymm0, %ymm2 ; BTVER2-NEXT: vmulps %ymm2, %ymm0, %ymm3 ; BTVER2-NEXT: vsubps %ymm3, %ymm4, %ymm3 @@ -1619,7 +1617,7 @@ ; SANDY: # %bb.0: ; SANDY-NEXT: vrcpps %ymm0, %ymm2 ; SANDY-NEXT: vmulps %ymm2, %ymm0, %ymm3 -; SANDY-NEXT: vmovaps {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; SANDY-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; SANDY-NEXT: vsubps %ymm3, %ymm4, %ymm3 ; SANDY-NEXT: vmulps %ymm3, %ymm2, %ymm3 ; SANDY-NEXT: vaddps %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -530,14 +530,14 @@ ; AVX2-LABEL: v16i4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -545,14 +545,14 @@ ; AVX512F-LABEL: v16i4: 
; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -650,8 +650,7 @@ ; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX2-NEXT: retq ; @@ -741,8 +740,7 @@ ; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; AVX2-NEXT: retq ; @@ -875,8 +873,7 @@ ; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm2, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vblendvps %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: retq ; @@ -1063,7 +1060,7 @@ ; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 @@ -1197,7 +1194,8 @@ ; AVX1-LABEL: v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0 @@ -1207,7 +1205,8 @@ ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; AVX2-NEXT: # xmm3 = mem[0,0] ; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0 @@ -1217,7 +1216,8 @@ ; AVX512F-LABEL: v2i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; 
AVX512F-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0] +; AVX512F-NEXT: # xmm3 = mem[0,0] ; AVX512F-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0 @@ -1412,8 +1412,7 @@ ; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 ; AVX2-NEXT: retq ; @@ -1733,7 +1732,7 @@ ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm5 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vxorpd %ymm5, %ymm4, %ymm4 ; AVX1-NEXT: vblendvpd %ymm0, %ymm4, %ymm7, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 diff --git a/llvm/test/CodeGen/X86/sar_fold64.ll b/llvm/test/CodeGen/X86/sar_fold64.ll --- a/llvm/test/CodeGen/X86/sar_fold64.ll +++ b/llvm/test/CodeGen/X86/sar_fold64.ll @@ -85,8 +85,7 @@ ; ; AVX2-LABEL: all_sign_bit_ashr_vec0: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq @@ -116,8 +115,7 @@ ; AVX2-LABEL: all_sign_bit_ashr_vec1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq @@ -145,8 +143,7 @@ ; ; AVX2-LABEL: all_sign_bit_ashr_vec2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -176,8 +173,7 @@ ; AVX2-LABEL: all_sign_bit_ashr_vec3: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -488,10 +488,8 @@ ; ; AVX2-LABEL: unsigned_sat_constant_v4i32_using_min: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967253,4294967253,4294967253,4294967253] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_constant_v4i32_using_min: @@ -525,10 +523,8 @@ 
; ; AVX2-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967253,4294967253,4294967253,4294967253] -; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum: @@ -560,10 +556,8 @@ ; ; AVX2-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967253,4294967253,4294967253,4294967253] -; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval: @@ -656,9 +650,10 @@ ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_min: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovapd {{.*#+}} xmm1 = [18446744073709551573,18446744073709551573] +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [NaN,NaN] +; AVX2-NEXT: # xmm1 = mem[0,0] ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775765,9223372036854775765] ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -725,7 +720,7 @@ ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -784,7 +779,7 @@ ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -1266,7 +1261,7 @@ ; ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/sdiv-exact.ll b/llvm/test/CodeGen/X86/sdiv-exact.ll --- a/llvm/test/CodeGen/X86/sdiv-exact.ll +++ b/llvm/test/CodeGen/X86/sdiv-exact.ll @@ -49,8 +49,7 @@ ; X64-LABEL: test3: ; X64: # %bb.0: ; X64-NEXT: vpsrad $3, %xmm0, %xmm0 -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq %div = sdiv exact <4 x i32> %x, ret <4 x i32> %div @@ -70,8 +69,7 @@ ; ; X64-LABEL: test4: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd {{.*#+}} xmm1 = 
[3264175145,3264175145,3264175145,3264175145] -; X64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: retq %div = sdiv exact <4 x i32> %x, ret <4 x i32> %div diff --git a/llvm/test/CodeGen/X86/select-of-fp-constants.ll b/llvm/test/CodeGen/X86/select-of-fp-constants.ll --- a/llvm/test/CodeGen/X86/select-of-fp-constants.ll +++ b/llvm/test/CodeGen/X86/select-of-fp-constants.ll @@ -77,9 +77,8 @@ ; X64-AVX2-LABEL: fcmp_select_fp_constants: ; X64-AVX2: # %bb.0: ; X64-AVX2-NEXT: vcmpneqss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [4.2E+1,4.2E+1,4.2E+1,4.2E+1] -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2.3E+1,2.3E+1,2.3E+1,2.3E+1] -; X64-AVX2-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.3E+1,2.3E+1,2.3E+1,2.3E+1] +; X64-AVX2-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512F-LABEL: fcmp_select_fp_constants: diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -108,8 +108,8 @@ ; CHECK-AVX2-NEXT: movq 32(%rsi), %rdx ; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0] ; CHECK-AVX2-NEXT: xorl %esi, %esi -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1] -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2] +; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,1] +; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,2] ; CHECK-AVX2-NEXT: .p2align 4, 0x90 ; CHECK-AVX2-NEXT: .LBB0_1: # %vector.ph ; CHECK-AVX2-NEXT: # =>This Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -2192,8 +2192,7 @@ ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: xorl %edx, %edx ; X86-AVX2-NEXT: divl 32(%esi) -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] -; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X86-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0 ; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X86-AVX2-NEXT: movl %eax, (%eax) ; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) @@ -2395,8 +2394,7 @@ ; X64-AVX2-NEXT: vmovd %xmm0, %eax ; X64-AVX2-NEXT: xorl %edx, %edx ; X64-AVX2-NEXT: divl 32(%rsi) -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] -; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 +; X64-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 ; X64-AVX2-NEXT: movl %eax, (%rax) ; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) diff --git a/llvm/test/CodeGen/X86/shuffle-blendw.ll b/llvm/test/CodeGen/X86/shuffle-blendw.ll --- a/llvm/test/CodeGen/X86/shuffle-blendw.ll +++ b/llvm/test/CodeGen/X86/shuffle-blendw.ll @@ -162,8 +162,7 @@ ; X86-AVX-NEXT: vpaddw %xmm3, %xmm4, %xmm3 ; X86-AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; X86-AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; X86-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,0,0,65535,65535,65535,0,65535,65535,0,0] -; X86-AVX-NEXT: # ymm2 = mem[0,1,0,1] +; X86-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,0,0,65535,65535,65535,0,65535,65535,0,0] ; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X86-AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; 
X86-AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -176,8 +175,7 @@ ; X64-AVX-NEXT: vpaddw %xmm3, %xmm4, %xmm3 ; X64-AVX-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ; X64-AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; X64-AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,0,0,65535,65535,65535,0,65535,65535,0,0] -; X64-AVX-NEXT: # ymm2 = mem[0,1,0,1] +; X64-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,0,65535,65535,0,0,65535,65535,65535,0,65535,65535,0,0] ; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X64-AVX-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; X64-AVX-NEXT: vorps %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll --- a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -38,7 +38,7 @@ ; ; AVX2-FAST-ALL-LABEL: foo8: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] +; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [7.00649232E-45,7.00649232E-45,7.00649232E-45,7.00649232E-45,7.00649232E-45,7.00649232E-45,7.00649232E-45,7.00649232E-45] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vmovaps %ymm0, (%rdi) ; AVX2-FAST-ALL-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -18,18 +18,27 @@ ; Ideally, the shuffles should be lowered to code with the same quality as the truncates. define void @shuffle_v32i8_to_v16i8(ptr %L, ptr %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand 16(%rdi), %xmm0, %xmm1 -; AVX-NEXT: vpand (%rdi), %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: retq +; AVX1-LABEL: shuffle_v32i8_to_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand 16(%rdi), %xmm0, %xmm1 +; AVX1-NEXT: vpand (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_to_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand 16(%rdi), %xmm0, %xmm1 +; AVX2-NEXT: vpand (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand 16(%rdi), %xmm0, %xmm1 ; AVX512F-NEXT: vpand (%rdi), %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -347,25 +347,37 @@ ; ; AVX2-SLOW-LABEL: test_mul_v4i32_v4i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] ; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; AVX2-SLOW-NEXT: ret{{[l|q]}} ; -; AVX-32-LABEL: test_mul_v4i32_v4i16: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] -; AVX-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-32-NEXT: retl -; -; AVX-64-LABEL: test_mul_v4i32_v4i16: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] -; AVX-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-64-NEXT: retq +; AVX2-32-LABEL: test_mul_v4i32_v4i16: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_mul_v4i32_v4i16: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: retq +; +; AVX512-32-LABEL: test_mul_v4i32_v4i16: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] +; AVX512-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_mul_v4i32_v4i16: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] +; AVX512-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-64-NEXT: retq %z = zext <4 x i16> %A to <4 x i32> %m = mul nuw nsw <4 x i32> %z, ret <4 x i32> %m @@ -407,7 +419,7 @@ ; ; AVX2-SLOW-LABEL: test_mul_v8i32_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm1 = [18778,18778,18778,18778,18778,18778,18778,18778] ; AVX2-SLOW-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] @@ -415,19 +427,31 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: ret{{[l|q]}} ; -; AVX-32-LABEL: test_mul_v8i32_v8i16: -; AVX-32: # %bb.0: -; AVX-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX-32-NEXT: retl -; -; AVX-64-LABEL: test_mul_v8i32_v8i16: -; AVX-64: # %bb.0: -; AVX-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX-64-NEXT: retq +; AVX2-32-LABEL: test_mul_v8i32_v8i16: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_mul_v8i32_v8i16: +; AVX2-64: # %bb.0: 
+; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512-32-LABEL: test_mul_v8i32_v8i16: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX512-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_mul_v8i32_v8i16: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX512-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512-64-NEXT: retq %z = zext <8 x i16> %A to <8 x i32> %m = mul nuw nsw <8 x i32> %z, ret <8 x i32> %m @@ -490,7 +514,7 @@ ; ; AVX2-SLOW-LABEL: test_mul_v16i32_v16i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778] +; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778] ; AVX2-SLOW-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] @@ -847,12 +871,43 @@ ; SSE-64-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-64-NEXT: retq ; -; AVX2-LABEL: test_mul_v4i32_v4i16_minsize: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: ret{{[l|q]}} +; AVX2-SLOW32-LABEL: test_mul_v4i32_v4i16_minsize: +; AVX2-SLOW32: # %bb.0: +; AVX2-SLOW32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; AVX2-SLOW32-NEXT: retl +; +; AVX2-SLOW64-LABEL: test_mul_v4i32_v4i16_minsize: +; AVX2-SLOW64: # %bb.0: +; AVX2-SLOW64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-SLOW64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW64-NEXT: retq +; +; AVX2-32-LABEL: test_mul_v4i32_v4i16_minsize: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_mul_v4i32_v4i16_minsize: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: retq +; +; AVX512-32-LABEL: test_mul_v4i32_v4i16_minsize: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-32-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] +; AVX512-32-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: 
test_mul_v4i32_v4i16_minsize: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512-64-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778] +; AVX512-64-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX512-64-NEXT: retq %z = zext <4 x i16> %A to <4 x i32> %m = mul nuw nsw <4 x i32> %z, ret <4 x i32> %m @@ -892,12 +947,43 @@ ; SSE4-NEXT: movdqa %xmm2, %xmm0 ; SSE4-NEXT: ret{{[l|q]}} ; -; AVX2-LABEL: test_mul_v8i32_v8i16_minsize: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] -; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} +; AVX2-SLOW32-LABEL: test_mul_v8i32_v8i16_minsize: +; AVX2-SLOW32: # %bb.0: +; AVX2-SLOW32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-SLOW32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; AVX2-SLOW32-NEXT: retl +; +; AVX2-SLOW64-LABEL: test_mul_v8i32_v8i16_minsize: +; AVX2-SLOW64: # %bb.0: +; AVX2-SLOW64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-SLOW64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW64-NEXT: retq +; +; AVX2-32-LABEL: test_mul_v8i32_v8i16_minsize: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: test_mul_v8i32_v8i16_minsize: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-64-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-64-NEXT: retq +; +; AVX512-32-LABEL: test_mul_v8i32_v8i16_minsize: +; AVX512-32: # %bb.0: +; AVX512-32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX512-32-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512-32-NEXT: retl +; +; AVX512-64-LABEL: test_mul_v8i32_v8i16_minsize: +; AVX512-64: # %bb.0: +; AVX512-64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX512-64-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778] +; AVX512-64-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; AVX512-64-NEXT: retq %z = zext <8 x i16> %A to <8 x i32> %m = mul nuw nsw <8 x i32> %z, ret <8 x i32> %m @@ -998,6 +1084,9 @@ ret <16 x i32> %m } ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX-32: {{.*}} +; AVX-64: {{.*}} +; AVX2: {{.*}} ; SLM-32: {{.*}} ; SLM-64: {{.*}} ; SLOW-32: {{.*}} diff --git a/llvm/test/CodeGen/X86/splat-const.ll b/llvm/test/CodeGen/X86/splat-const.ll --- a/llvm/test/CodeGen/X86/splat-const.ll +++ b/llvm/test/CodeGen/X86/splat-const.ll @@ -38,12 +38,12 @@ ; ; AVX-LABEL: const_vector: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [42,42,42,42] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] ; AVX-NEXT: retq ; ; AVX2-LABEL: const_vector: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [5.88545355E-44,5.88545355E-44,5.88545355E-44,5.88545355E-44] ; AVX2-NEXT: retq %const = insertelement <4 x i32> undef, i32 42, i32 0 %splat = shufflevector <4 x i32> %const, <4 x i32> undef, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/splat-for-size.ll b/llvm/test/CodeGen/X86/splat-for-size.ll --- a/llvm/test/CodeGen/X86/splat-for-size.ll +++ b/llvm/test/CodeGen/X86/splat-for-size.ll @@ -9,9 +9,7 @@ define <2 x double> @splat_v2f64(<2 x double> %x) #0 { ; CHECK-LABEL: splat_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] -; CHECK-NEXT: # xmm1 = mem[0,0] -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %add = fadd <2 x double> %x, ret <2 x double> %add @@ -20,9 +18,7 @@ define <2 x double> @splat_v2f64_pgso(<2 x double> %x) !prof !14 { ; CHECK-LABEL: splat_v2f64_pgso: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] -; CHECK-NEXT: # xmm1 = mem[0,0] -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %add = fadd <2 x double> %x, ret <2 x double> %add @@ -31,8 +27,7 @@ define <4 x double> @splat_v4f64(<4 x double> %x) #1 { ; CHECK-LABEL: splat_v4f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq %add = fadd <4 x double> %x, ret <4 x double> %add @@ -41,8 +36,7 @@ define <4 x double> @splat_v4f64_pgso(<4 x double> %x) !prof !14 { ; CHECK-LABEL: splat_v4f64_pgso: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq %add = fadd <4 x double> %x, ret <4 x double> %add @@ -51,8 +45,7 @@ define <4 x float> @splat_v4f32(<4 x float> %x) #0 { ; CHECK-LABEL: splat_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %add = fadd <4 x float> %x, ret <4 x float> %add @@ -61,8 +54,7 @@ define <4 x float> @splat_v4f32_pgso(<4 x float> %x) !prof !14 { ; CHECK-LABEL: splat_v4f32_pgso: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NEXT: retq %add = fadd <4 x float> %x, ret <4 x float> %add @@ -71,8 +63,7 @@ define <8 x float> @splat_v8f32(<8 x float> %x) #1 { ; CHECK-LABEL: splat_v8f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss 
{{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq %add = fadd <8 x float> %x, ret <8 x float> %add @@ -81,8 +72,7 @@ define <8 x float> @splat_v8f32_pgso(<8 x float> %x) !prof !14 { ; CHECK-LABEL: splat_v8f32_pgso: ; CHECK: # %bb.0: -; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; CHECK-NEXT: retq %add = fadd <8 x float> %x, ret <8 x float> %add @@ -91,35 +81,19 @@ ; AVX can't do integer splats, so fake it: use vmovddup to splat 64-bit value. ; We also generate vmovddup for AVX2 because it's one byte smaller than vpbroadcastq. define <2 x i64> @splat_v2i64(<2 x i64> %x) #1 { -; AVX-LABEL: splat_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [2,2] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; AVX2-LABEL: splat_v2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; CHECK-LABEL: splat_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq %add = add <2 x i64> %x, ret <2 x i64> %add } define <2 x i64> @splat_v2i64_pgso(<2 x i64> %x) !prof !14 { -; AVX-LABEL: splat_v2i64_pgso: -; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [2,2] -; AVX-NEXT: # xmm1 = mem[0,0] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; AVX2-LABEL: splat_v2i64_pgso: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; CHECK-LABEL: splat_v2i64_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq %add = add <2 x i64> %x, ret <2 x i64> %add } @@ -130,8 +104,7 @@ ; AVX-LABEL: splat_v4i64: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [2,2] -; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2] ; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -139,8 +112,7 @@ ; ; AVX2-LABEL: splat_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <4 x i64> %x, ret <4 x i64> %add @@ -150,8 +122,7 @@ ; AVX-LABEL: splat_v4i64_pgso: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm2 = [2,2] -; AVX-NEXT: # xmm2 = mem[0,0] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2] ; AVX-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -159,8 +130,7 @@ ; ; AVX2-LABEL: splat_v4i64_pgso: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <4 x i64> %x, ret <4 x i64> %add @@ -168,33 +138,19 @@ ; AVX can't do integer splats, so fake it: use vbroadcastss to splat 32-bit value. 
define <4 x i32> @splat_v4i32(<4 x i32> %x) #1 { -; AVX-LABEL: splat_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; AVX2-LABEL: splat_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; CHECK-LABEL: splat_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq %add = add <4 x i32> %x, ret <4 x i32> %add } define <4 x i32> @splat_v4i32_pgso(<4 x i32> %x) !prof !14 { -; AVX-LABEL: splat_v4i32_pgso: -; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2] -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; AVX2-LABEL: splat_v4i32_pgso: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; CHECK-LABEL: splat_v4i32_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq %add = add <4 x i32> %x, ret <4 x i32> %add } @@ -204,7 +160,7 @@ ; AVX-LABEL: splat_v8i32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2] ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -212,8 +168,7 @@ ; ; AVX2-LABEL: splat_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <8 x i32> %x, ret <8 x i32> %add @@ -223,7 +178,7 @@ ; AVX-LABEL: splat_v8i32_pgso: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [2,2,2,2] ; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -231,8 +186,7 @@ ; ; AVX2-LABEL: splat_v8i32_pgso: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <8 x i32> %x, ret <8 x i32> %add @@ -240,31 +194,19 @@ ; AVX can't do integer splats, and there's no broadcast fakery for 16-bit. Could use pshuflw, etc? 
define <8 x i16> @splat_v8i16(<8 x i16> %x) #1 { -; AVX-LABEL: splat_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; AVX2-LABEL: splat_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; CHECK-LABEL: splat_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq %add = add <8 x i16> %x, ret <8 x i16> %add } define <8 x i16> @splat_v8i16_pgso(<8 x i16> %x) !prof !14 { -; AVX-LABEL: splat_v8i16_pgso: -; AVX: # %bb.0: -; AVX-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; AVX2-LABEL: splat_v8i16_pgso: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; CHECK-LABEL: splat_v8i16_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq %add = add <8 x i16> %x, ret <8 x i16> %add } @@ -282,8 +224,7 @@ ; ; AVX2-LABEL: splat_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <16 x i16> %x, ret <16 x i16> %add @@ -301,8 +242,7 @@ ; ; AVX2-LABEL: splat_v16i16_pgso: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <16 x i16> %x, ret <16 x i16> %add @@ -310,31 +250,19 @@ ; AVX can't do integer splats, and there's no broadcast fakery for 8-bit. Could use pshufb, etc? 
define <16 x i8> @splat_v16i8(<16 x i8> %x) #1 { -; AVX-LABEL: splat_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; AVX2-LABEL: splat_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; CHECK-LABEL: splat_v16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq %add = add <16 x i8> %x, ret <16 x i8> %add } define <16 x i8> @splat_v16i8_pgso(<16 x i8> %x) !prof !14 { -; AVX-LABEL: splat_v16i8_pgso: -; AVX: # %bb.0: -; AVX-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; AVX2-LABEL: splat_v16i8_pgso: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; CHECK-LABEL: splat_v16i8_pgso: +; CHECK: # %bb.0: +; CHECK-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-NEXT: retq %add = add <16 x i8> %x, ret <16 x i8> %add } @@ -352,8 +280,7 @@ ; ; AVX2-LABEL: splat_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <32 x i8> %x, ret <32 x i8> %add @@ -371,8 +298,7 @@ ; ; AVX2-LABEL: splat_v32i8_pgso: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq %add = add <32 x i8> %x, ret <32 x i8> %add @@ -390,14 +316,14 @@ ; AVX-NEXT: vmovaps A+16(%rip), %xmm0 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],mem[2,3] ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] -; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324] ; AVX-NEXT: retq ; ; AVX2-LABEL: pr23259: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vmovaps A+16(%rip), %xmm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2,3,4,5,6,7] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1,1,1,1] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324] ; AVX2-NEXT: retq entry: %0 = load <4 x i64>, ptr @A, align 32 diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll @@ -6,12 +6,13 @@ define float @sqrt_ieee(float %f) #0 { ; CHECK-LABEL: name: sqrt_ieee ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: %1:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr - ; CHECK: $xmm0 = COPY %1 - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]] + ; CHECK-NEXT: RET 0, $xmm0 %call = tail call float @llvm.sqrt.f32(float 
%f) ret float %call } @@ -19,31 +20,31 @@ define float @sqrt_ieee_ninf(float %f) #0 { ; CHECK-LABEL: name: sqrt_ieee_ninf ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK: %5:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK: %7:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = ninf afn nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = ninf afn nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: %11:fr32 = ninf afn nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %12:fr32 = ninf afn nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr - ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %12 - ; CHECK: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]] - ; CHECK: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load (s32) from constant-pool) - ; CHECK: [[VPANDrr:%[0-9]+]]:vr128 = VPANDrr killed [[COPY2]], killed [[VPBROADCASTDrm]] - ; CHECK: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDrr]] - ; CHECK: %18:fr32 = nofpexcept VCMPSSrm killed [[COPY3]], $rip, 1, $noreg, %const.3, $noreg, 1, implicit $mxcsr :: (load (s32) from constant-pool) - ; CHECK: [[COPY4:%[0-9]+]]:vr128 = COPY %18 - ; CHECK: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY4]], killed [[COPY1]] - ; CHECK: [[COPY5:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] - ; CHECK: $xmm0 = COPY [[COPY5]] - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: 
[[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY [[VMULSSrr5]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]] + ; CHECK-NEXT: [[VPANDrm:%[0-9]+]]:vr128 = VPANDrm killed [[COPY2]], $rip, 1, $noreg, %const.2, $noreg :: (load (s128) from constant-pool) + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDrm]] + ; CHECK-NEXT: [[VCMPSSrm:%[0-9]+]]:fr32 = nofpexcept VCMPSSrm killed [[COPY3]], $rip, 1, $noreg, %const.3, $noreg, 1, implicit $mxcsr :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vr128 = COPY [[VCMPSSrm]] + ; CHECK-NEXT: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY4]], killed [[COPY1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] + ; CHECK-NEXT: $xmm0 = COPY [[COPY5]] + ; CHECK-NEXT: RET 0, $xmm0 %call = tail call ninf afn float @llvm.sqrt.f32(float %f) ret float %call } @@ -51,12 +52,13 @@ define float @sqrt_daz(float %f) #1 { ; CHECK-LABEL: name: sqrt_daz ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: %1:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr - ; CHECK: $xmm0 = COPY %1 - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VSQRTSSr:%[0-9]+]]:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VSQRTSSr]] + ; CHECK-NEXT: RET 0, $xmm0 %call = tail call float @llvm.sqrt.f32(float %f) ret float %call } @@ -64,28 +66,29 @@ define float @sqrt_daz_ninf(float %f) #1 { ; CHECK-LABEL: name: sqrt_daz_ninf ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK: %5:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK: %7:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = ninf afn nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = ninf afn nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: %11:fr32 = ninf afn nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %12:fr32 = ninf afn nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr - ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %12 - ; CHECK: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS - ; CHECK: %15:fr32 = nofpexcept VCMPSSrr [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr - ; CHECK: [[COPY2:%[0-9]+]]:vr128 = COPY %15 - ; CHECK: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY2]], killed [[COPY1]] - ; CHECK: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] - ; CHECK: $xmm0 = COPY [[COPY3]] - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
[[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = ninf afn nofpexcept VFMADD213SSr [[VMULSSrr2]], [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr [[VMULSSrr3]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = ninf afn nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vr128 = COPY [[VMULSSrr5]] + ; CHECK-NEXT: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS + ; CHECK-NEXT: [[VCMPSSrr:%[0-9]+]]:fr32 = nofpexcept VCMPSSrr [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vr128 = COPY [[VCMPSSrr]] + ; CHECK-NEXT: [[VPANDNrr:%[0-9]+]]:vr128 = VPANDNrr killed [[COPY2]], killed [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDNrr]] + ; CHECK-NEXT: $xmm0 = COPY [[COPY3]] + ; CHECK-NEXT: RET 0, $xmm0 %call = tail call ninf afn float @llvm.sqrt.f32(float %f) ret float %call } @@ -93,22 +96,23 @@ define float @rsqrt_ieee(float %f) #0 { ; CHECK-LABEL: name: rsqrt_ieee ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK: %5:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK: %7:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr %8, killed %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: %11:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr %8, [[VMOVSSrm_alt1]], implicit $mxcsr - ; 
CHECK: %12:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr - ; CHECK: $xmm0 = COPY %12 - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VMULSSrr2]], killed [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VMULSSrr2]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VMULSSrr5]] + ; CHECK-NEXT: RET 0, $xmm0 %sqrt = tail call float @llvm.sqrt.f32(float %f) %div = fdiv fast float 1.0, %sqrt ret float %div @@ -117,22 +121,23 @@ define float @rsqrt_daz(float %f) #1 { ; CHECK-LABEL: name: rsqrt_daz ; CHECK: bb.0 (%ir-block.0): - ; CHECK: liveins: $xmm0 - ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 - ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF - ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc VRSQRTSSr killed [[DEF]], [[COPY]] - ; CHECK: %3:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) - ; CHECK: %5:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr - ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) - ; CHECK: %7:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %8:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr - ; CHECK: %9:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr - ; CHECK: %10:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr %8, killed %9, [[VMOVSSrm_alt]], implicit $mxcsr - ; 
CHECK: %11:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr %8, [[VMOVSSrm_alt1]], implicit $mxcsr - ; CHECK: %12:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr - ; CHECK: $xmm0 = COPY %12 - ; CHECK: RET 0, $xmm0 + ; CHECK-NEXT: liveins: $xmm0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF + ; CHECK-NEXT: [[VRSQRTSSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc VRSQRTSSr killed [[DEF]], [[COPY]] + ; CHECK-NEXT: [[VMULSSrr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VFMADD213SSr:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed [[VMULSSrr]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load (s32) from constant-pool) + ; CHECK-NEXT: [[VMULSSrr1:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr2:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr1]], killed [[VFMADD213SSr]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr3:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[COPY]], [[VMULSSrr2]], implicit $mxcsr + ; CHECK-NEXT: [[VFMADD213SSr1:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VFMADD213SSr [[VMULSSrr2]], killed [[VMULSSrr3]], [[VMOVSSrm_alt]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr4:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr [[VMULSSrr2]], [[VMOVSSrm_alt1]], implicit $mxcsr + ; CHECK-NEXT: [[VMULSSrr5:%[0-9]+]]:fr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept VMULSSrr killed [[VMULSSrr4]], killed [[VFMADD213SSr1]], implicit $mxcsr + ; CHECK-NEXT: $xmm0 = COPY [[VMULSSrr5]] + ; CHECK-NEXT: RET 0, $xmm0 %sqrt = tail call float @llvm.sqrt.f32(float %f) %div = fdiv fast float 1.0, %sqrt ret float %div diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -64,7 +64,7 @@ ; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; SNB-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; SNB-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SNB-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; SNB-NEXT: vandps %xmm1, %xmm0, %xmm0 ; SNB-NEXT: retq @@ -73,13 +73,10 @@ ; BDW: # %bb.0: ; BDW-NEXT: vrsqrtps %xmm0, %xmm1 ; BDW-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; BDW-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; BDW-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 -; BDW-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vfmadd213ps {{.*#+}} xmm1 = (xmm2 * xmm1) + mem +; BDW-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; BDW-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; BDW-NEXT: vmulps %xmm3, %xmm1, %xmm1 -; BDW-NEXT: vbroadcastss {{.*#+}} 
xmm2 = [NaN,NaN,NaN,NaN] -; BDW-NEXT: vandps %xmm2, %xmm0, %xmm0 +; BDW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; BDW-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; BDW-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -152,7 +149,7 @@ ; SNB-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; SNB-NEXT: vmulps %ymm1, %ymm3, %ymm1 ; SNB-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; SNB-NEXT: vmovaps {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; SNB-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; SNB-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 ; SNB-NEXT: vandps %ymm1, %ymm0, %ymm0 ; SNB-NEXT: retq @@ -161,13 +158,10 @@ ; BDW: # %bb.0: ; BDW-NEXT: vrsqrtps %ymm0, %ymm1 ; BDW-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; BDW-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; BDW-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3 -; BDW-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm2 * ymm1) + mem +; BDW-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; BDW-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; BDW-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; BDW-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] -; BDW-NEXT: vandps %ymm2, %ymm0, %ymm0 +; BDW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; BDW-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; BDW-NEXT: vcmpleps %ymm0, %ymm2, %ymm0 ; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0 @@ -277,11 +271,9 @@ ; BDW: # %bb.0: ; BDW-NEXT: vrsqrtps %xmm0, %xmm1 ; BDW-NEXT: vmulps %xmm1, %xmm0, %xmm2 -; BDW-NEXT: vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; BDW-NEXT: vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3 -; BDW-NEXT: vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vfmadd213ps {{.*#+}} xmm1 = (xmm2 * xmm1) + mem +; BDW-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; BDW-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; BDW-NEXT: vmulps %xmm3, %xmm1, %xmm1 ; BDW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; BDW-NEXT: vcmpneqps %xmm2, %xmm0, %xmm0 ; BDW-NEXT: vandps %xmm1, %xmm0, %xmm0 @@ -354,11 +346,9 @@ ; BDW: # %bb.0: ; BDW-NEXT: vrsqrtps %ymm0, %ymm1 ; BDW-NEXT: vmulps %ymm1, %ymm0, %ymm2 -; BDW-NEXT: vbroadcastss {{.*#+}} ymm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] -; BDW-NEXT: vfmadd231ps {{.*#+}} ymm3 = (ymm2 * ymm1) + ymm3 -; BDW-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; BDW-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm2 * ymm1) + mem +; BDW-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; BDW-NEXT: vmulps %ymm1, %ymm2, %ymm1 -; BDW-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; BDW-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; BDW-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 ; BDW-NEXT: vandps %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll --- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll @@ -210,7 +210,7 @@ ; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%xmm1, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -261,7 +261,7 @@ ; AVX1-NEXT: vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -360,19 +360,12 @@ ; SSE-NEXT: divps %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: v4f32_no_estimate: -; AVX1: # %bb.0: -; AVX1-NEXT: vsqrtps %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX1-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: v4f32_no_estimate: -; AVX512: # %bb.0: -; AVX512-NEXT: vsqrtps %xmm0, %xmm0 -; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vdivps %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: v4f32_no_estimate: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %xmm0, %xmm0 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vdivps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) %div = fdiv fast <4 x float> , %sqrt ret <4 x float> %div @@ -431,7 +424,7 @@ ; AVX1-NEXT: vrsqrtps %xmm0, %xmm1 ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovaps {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] +; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38] ; AVX1-NEXT: vcmpleps %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -461,19 +454,12 @@ ; SSE-NEXT: divps %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: v8f32_no_estimate: -; AVX1: # %bb.0: -; AVX1-NEXT: vsqrtps %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX1-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: v8f32_no_estimate: -; AVX512: # %bb.0: -; AVX512-NEXT: vsqrtps %ymm0, %ymm0 -; AVX512-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX512-NEXT: vdivps %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: v8f32_no_estimate: +; AVX: # %bb.0: +; AVX-NEXT: vsqrtps %ymm0, %ymm0 +; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vdivps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x) %div = fdiv fast <8 x float> , %sqrt ret <8 x float> %div @@ -544,7 +530,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vsqrtps %ymm1, %ymm1 ; AVX1-NEXT: vsqrtps %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX1-NEXT: vdivps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vdivps %ymm1, 
%ymm2, %ymm1 ; AVX1-NEXT: retq @@ -595,11 +581,11 @@ ; AVX1-LABEL: v16f32_estimate: ; AVX1: # %bb.0: ; AVX1-NEXT: vrsqrtps %ymm0, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; AVX1-NEXT: vmulps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vmulps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0] ; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vrsqrtps %ymm1, %ymm5 ; AVX1-NEXT: vmulps %ymm0, %ymm4, %ymm0 @@ -985,7 +971,8 @@ ; AVX-LABEL: sqrt_simplify_before_recip_vec: ; AVX: # %bb.0: ; AVX-NEXT: vsqrtpd %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vdivpd %xmm0, %xmm1, %xmm1 ; AVX-NEXT: vmovupd %xmm1, (%rdi) ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -371,8 +371,7 @@ ; AVX2-NEXT: addq %rcx, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8589934591,8589934591,8589934591,8589934591] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -128,10 +128,8 @@ ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -187,14 +185,11 @@ ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, 
%xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_allones_ne: @@ -260,10 +255,8 @@ ; ; CHECK-AVX2-LABEL: test_srem_even_allones_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -334,17 +327,14 @@ ; ; CHECK-AVX2-LABEL: test_srem_even_allones_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_even_allones_ne: @@ -508,8 +498,7 @@ ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_odd_even_allones_ne: @@ -825,10 +814,8 @@ ; ; CHECK-AVX2-LABEL: test_srem_odd_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -897,10 +884,8 @@ ; ; CHECK-AVX2-LABEL: test_srem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -1064,8 +1049,7 @@ ; 
CHECK-AVX2-LABEL: test_srem_odd_INT_MIN: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm2, %xmm0, %xmm2 +; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm1 ; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -1186,8 +1170,7 @@ ; CHECK-AVX2-NEXT: vpor %xmm4, %xmm2, %xmm2 ; CHECK-AVX2-NEXT: vpminud %xmm3, %xmm2, %xmm3 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -1303,8 +1286,7 @@ ; CHECK-AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -1626,10 +1608,8 @@ ; ; CHECK-AVX2-LABEL: test_srem_odd_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [429496729,429496729,429496729,429496729] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -1698,10 +1678,8 @@ ; ; CHECK-AVX2-LABEL: test_srem_even_allones_and_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [306783378,306783378,306783378,306783378] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -2434,7 +2412,7 @@ ; CHECK-AVX2-NEXT: vpackuswb %ymm3, %ymm4, %ymm3 ; CHECK-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; CHECK-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; CHECK-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 ; CHECK-AVX2-NEXT: vpshufb {{.*#+}} ymm6 = 
ymm0[8],zero,ymm0[9],zero,zero,zero,ymm0[11],zero,zero,zero,ymm0[13],zero,zero,zero,ymm0[15],zero,zero,zero,ymm0[25],zero,zero,zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,zero,zero,ymm0[31],zero ; CHECK-AVX2-NEXT: vpackuswb %ymm6, %ymm4, %ymm4 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-splat.ll @@ -43,12 +43,9 @@ ; ; CHECK-AVX2-LABEL: test_srem_odd_25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899345,85899345,85899345,85899345] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -116,15 +113,12 @@ ; ; CHECK-AVX2-LABEL: test_srem_even_100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -184,12 +178,9 @@ ; ; CHECK-AVX2-LABEL: test_srem_odd_neg25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899345,85899345,85899345,85899345] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798690,171798690,171798690,171798690] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -257,15 +248,12 @@ ; ; CHECK-AVX2-LABEL: test_srem_even_neg100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [85899344,85899344,85899344,85899344] -; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: 
vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -369,8 +357,7 @@ ; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 ; CHECK-AVX2-NEXT: vpsrad $3, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -480,8 +467,7 @@ ; CHECK-AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 ; CHECK-AVX2-NEXT: vpsrad $5, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -523,12 +509,12 @@ ; ; CHECK-AVX1-LABEL: test_srem_one_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_one_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_one_eq: @@ -588,8 +574,7 @@ ; CHECK-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpsrld $28, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967280,4294967280,4294967280,4294967280] -; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -645,8 +630,7 @@ ; CHECK-AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; CHECK-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -679,12 +663,12 @@ ; ; CHECK-AVX1-LABEL: test_srem_allones: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_srem_allones: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = 
[1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_srem_allones: diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -603,7 +603,7 @@ ; ; X86-AVX1-LABEL: test17: ; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = +; X86-AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [4.59177481E-41,4.59177481E-41,4.59177481E-41,4.59177481E-41] ; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX1-NEXT: retl ; @@ -621,7 +621,7 @@ ; ; X64-AVX1-LABEL: test17: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = +; X64-AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [4.59177481E-41,4.59177481E-41,4.59177481E-41,4.59177481E-41] ; X64-AVX1-NEXT: vmovaps %xmm0, (%rax) ; X64-AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/sshl_sat_vec.ll b/llvm/test/CodeGen/X86/sshl_sat_vec.ll --- a/llvm/test/CodeGen/X86/sshl_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sshl_sat_vec.ll @@ -48,8 +48,10 @@ ; ; X64-AVX2-LABEL: vec_v2i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vmovapd {{.*#+}} xmm3 = [9223372036854775807,9223372036854775807] +; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; X64-AVX2-NEXT: # xmm2 = mem[0,0] +; X64-AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [NaN,NaN] +; X64-AVX2-NEXT: # xmm3 = mem[0,0] ; X64-AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm3 ; X64-AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 ; X64-AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm4 @@ -195,9 +197,8 @@ ; ; X64-AVX2-LABEL: vec_v4i32: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483647,2147483647,2147483647,2147483647] -; X64-AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm2 +; X64-AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN] +; X64-AVX2-NEXT: vblendvps %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; X64-AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpsravd %xmm1, %xmm3, %xmm1 ; X64-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -351,7 +352,7 @@ ; X64-AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vpcmpgtw %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767] +; X64-AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [32767,32767,32767,32767,32767,32767,32767,32767] ; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0 ; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper @@ -620,7 +621,7 @@ ; X64-AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 ; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 -; X64-AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; X64-AVX2-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm0 ; X64-AVX2-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 ; X64-AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -530,14 +530,14 @@ ; AVX2-LABEL: v16i4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq @@ -545,14 +545,14 @@ ; AVX512F-LABEL: v16i4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: retq @@ -677,8 +677,7 @@ ; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -778,8 +777,7 @@ ; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -925,8 +923,7 @@ ; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1126,7 +1123,7 @@ ; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 @@ -1292,7 +1289,8 @@ ; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vmovddup 
{{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: retq @@ -1304,7 +1302,8 @@ ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: retq @@ -1316,7 +1315,8 @@ ; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0] +; AVX512F-NEXT: # xmm2 = mem[0,0] ; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX512F-NEXT: retq @@ -1554,8 +1554,7 @@ ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1959,7 +1958,7 @@ ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vmovapd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/subvector-broadcast.ll b/llvm/test/CodeGen/X86/subvector-broadcast.ll --- a/llvm/test/CodeGen/X86/subvector-broadcast.ll +++ b/llvm/test/CodeGen/X86/subvector-broadcast.ll @@ -987,23 +987,23 @@ ; X86-AVX1-NEXT: movl %esp, %ebp ; X86-AVX1-NEXT: andl $-32, %esp ; X86-AVX1-NEXT: subl $32, %esp -; X86-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4] -; X86-AVX1-NEXT: # ymm3 = mem[0,1,0,1] +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,3,4] ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0 ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; X86-AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; X86-AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm4 +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4] +; X86-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; X86-AVX1-NEXT: vpaddd %xmm3, %xmm5, %xmm5 ; X86-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; X86-AVX1-NEXT: vpaddd 8(%ebp), %xmm3, %xmm4 -; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm5 -; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; X86-AVX1-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X86-AVX1-NEXT: vandps %ymm3, %ymm4, %ymm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; X86-AVX1-NEXT: vpaddd 8(%ebp), 
%xmm3, %xmm5 +; X86-AVX1-NEXT: vpaddd 24(%ebp), %xmm3, %xmm3 +; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; X86-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 +; X86-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; X86-AVX1-NEXT: vmovdqu %xmm0, ha4 ; X86-AVX1-NEXT: vmovups %ymm1, hb4 ; X86-AVX1-NEXT: vmovups %ymm3, hc4+32 @@ -1019,9 +1019,8 @@ ; X86-AVX2-NEXT: movl %esp, %ebp ; X86-AVX2-NEXT: andl $-32, %esp ; X86-AVX2-NEXT: subl $32, %esp -; X86-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4] -; X86-AVX2-NEXT: # ymm3 = mem[0,1,0,1] -; X86-AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,3,4,1,2,3,4] ; X86-AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 ; X86-AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 ; X86-AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 @@ -1054,24 +1053,24 @@ ; ; X64-AVX1-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4] -; X64-AVX1-NEXT: # ymm4 = mem[0,1,0,1] +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [1,2,3,4] ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 -; X64-AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [1,2,3,4,1,2,3,4] +; X64-AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 +; X64-AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 +; X64-AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; X64-AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm5 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 +; X64-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 +; X64-AVX1-NEXT: vpaddd %xmm4, %xmm6, %xmm6 ; X64-AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; X64-AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; X64-AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 +; X64-AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; X64-AVX1-NEXT: vandps %ymm5, %ymm2, %ymm2 +; X64-AVX1-NEXT: vandps %ymm5, %ymm3, %ymm3 ; X64-AVX1-NEXT: vmovdqu %xmm0, ha4(%rip) ; X64-AVX1-NEXT: vmovups %ymm1, hb4(%rip) ; X64-AVX1-NEXT: vmovups %ymm3, hc4+32(%rip) @@ -1081,9 +1080,8 @@ ; ; X64-AVX2-LABEL: fallback_broadcast_v4i32_v8i32_v16i32: ; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4] -; X64-AVX2-NEXT: # ymm4 = mem[0,1,0,1] -; X64-AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [1,2,3,4,1,2,3,4] ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; X64-AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -490,14 +490,41 @@ ; SSE-NEXT: pminub %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: v16i4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 -; AVX-NEXT: 
retq +; AVX1-LABEL: v16i4: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v16i4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16i4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpminub %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: retq %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } @@ -906,7 +933,7 @@ ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/umax.ll b/llvm/test/CodeGen/X86/umax.ll --- a/llvm/test/CodeGen/X86/umax.ll +++ b/llvm/test/CodeGen/X86/umax.ll @@ -369,14 +369,23 @@ ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; X86-LABEL: test_v2i64: ; X86: # %bb.0: @@ -629,16 +638,10 @@ ; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: test_v4i32_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i32_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: test_v4i32_1: +; AVX: # %bb.0: +; AVX-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq ; ; X86-LABEL: test_v4i32_1: ; X86: # %bb.0: @@ -788,8 +791,7 @@ ; ; AVX2-LABEL: 
test_v8i32_1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; X86-LABEL: test_v8i32_1: diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -727,8 +727,7 @@ ; ; CHECK-AVX2-LABEL: test_urem_odd_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -793,8 +792,7 @@ ; ; CHECK-AVX2-LABEL: test_urem_even_one: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3067833783,3067833783,3067833783,3067833783] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonzero.ll @@ -42,8 +42,7 @@ ; CHECK-AVX2-LABEL: t32_3: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -97,10 +96,8 @@ ; CHECK-AVX2-LABEL: t32_5: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3435973837,3435973837,3435973837,3435973837] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [858993458,858993458,858993458,858993458] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -164,13 +161,11 @@ ; CHECK-AVX2-LABEL: t32_6_part0: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [715827882,715827882,715827882,715827882] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq ; @@ -235,8 +230,7 @@ ; CHECK-AVX2-LABEL: 
t32_6_part1: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $1, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -297,8 +291,7 @@ ; CHECK-AVX2-LABEL: t32_tautological: ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpsubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-splat.ll @@ -40,10 +40,8 @@ ; ; CHECK-AVX2-LABEL: test_urem_odd_25: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [171798691,171798691,171798691,171798691] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -107,13 +105,11 @@ ; ; CHECK-AVX2-LABEL: test_urem_even_100: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3264175145,3264175145,3264175145,3264175145] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $2, %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpslld $30, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42949672,42949672,42949672,42949672] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -309,8 +305,7 @@ ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; CHECK-AVX2-NEXT: vpsrld $3, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [25,25,25,25] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -404,8 +399,7 @@ ; CHECK-AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; CHECK-AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; CHECK-AVX2-NEXT: vpsrld $5, %xmm1, %xmm1 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [100,100,100,100] -; CHECK-AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -445,12 +439,12 @@ ; ; CHECK-AVX1-LABEL: test_urem_one_eq: ; CHECK-AVX1: # %bb.0: -; CHECK-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; CHECK-AVX1-NEXT: retq ; ; CHECK-AVX2-LABEL: test_urem_one_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1,1,1,1] +; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; CHECK-AVX2-NEXT: retq ; ; CHECK-AVX512VL-LABEL: test_urem_one_eq: @@ -498,8 +492,7 @@ ; ; CHECK-AVX2-LABEL: test_urem_pow2: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -538,8 +531,7 @@ ; ; CHECK-AVX2-LABEL: test_urem_int_min: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147483647,2147483647,2147483647,2147483647] -; CHECK-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 @@ -593,8 +585,7 @@ ; CHECK-AVX2: # %bb.0: ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; CHECK-AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1 +; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpsrld $31, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll --- a/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-tautological.ll @@ -58,8 +58,7 @@ ; ; CHECK-AVX2-LABEL: t1_all_odd_eq: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -119,8 +118,7 @@ ; ; CHECK-AVX2-LABEL: t1_all_odd_ne: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2863311531,2863311531,2863311531,2863311531] -; CHECK-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 @@ -238,7 +236,7 @@ ; ; CHECK-AVX2-LABEL: t3_wide: ; CHECK-AVX2: # %bb.0: -; CHECK-AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] +; CHECK-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12297829382473034411,12297829382473034411] ; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 ; CHECK-AVX2-NEXT: vpsrlq $32, %xmm0, %xmm3 ; CHECK-AVX2-NEXT: vpmuludq %xmm1, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/urem-seteq.ll b/llvm/test/CodeGen/X86/urem-seteq.ll 
--- a/llvm/test/CodeGen/X86/urem-seteq.ll +++ b/llvm/test/CodeGen/X86/urem-seteq.ll @@ -362,7 +362,16 @@ ; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=34366 define void @ossfuzz34366() { ; X86-LABEL: ossfuzz34366: +; X86: # %bb.0: +; X86-NEXT: cmpl $0, (%eax) +; X86-NEXT: sete (%eax) +; X86-NEXT: retl +; ; X64-LABEL: ossfuzz34366: +; X64: # %bb.0: +; X64-NEXT: cmpq $0, (%rax) +; X64-NEXT: sete (%rax) +; X64-NEXT: retq %L10 = load i448, ptr undef, align 4 %B18 = urem i448 %L10, -363419362147803445274661903944002267176820680343659030140745099590319644056698961663095525356881782780381260803133088966767300814307328 %C13 = icmp ule i448 %B18, 0 diff --git a/llvm/test/CodeGen/X86/urem-vector-lkk.ll b/llvm/test/CodeGen/X86/urem-vector-lkk.ll --- a/llvm/test/CodeGen/X86/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/urem-vector-lkk.ll @@ -156,40 +156,22 @@ ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: dont_fold_urem_power_of_two: -; AVX1: # %bb.0: -; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpextrw $1, %xmm0, %eax -; AVX1-NEXT: andl $31, %eax -; AVX1-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpextrw $2, %xmm0, %eax -; AVX1-NEXT: andl $7, %eax -; AVX1-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX1-NEXT: vpextrw $3, %xmm0, %eax -; AVX1-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX1-NEXT: shrl $22, %ecx -; AVX1-NEXT: imull $95, %ecx, %ecx -; AVX1-NEXT: subl %ecx, %eax -; AVX1-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: dont_fold_urem_power_of_two: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [63,63,63,63] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpextrw $1, %xmm0, %eax -; AVX2-NEXT: andl $31, %eax -; AVX2-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vpextrw $2, %xmm0, %eax -; AVX2-NEXT: andl $7, %eax -; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vpextrw $3, %xmm0, %eax -; AVX2-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 -; AVX2-NEXT: shrl $22, %ecx -; AVX2-NEXT: imull $95, %ecx, %ecx -; AVX2-NEXT: subl %ecx, %eax -; AVX2-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: dont_fold_urem_power_of_two: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: andl $31, %eax +; AVX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: andl $7, %eax +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: imull $44151, %eax, %ecx # imm = 0xAC77 +; AVX-NEXT: shrl $22, %ecx +; AVX-NEXT: imull $95, %ecx, %ecx +; AVX-NEXT: subl %ecx, %eax +; AVX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm0 +; AVX-NEXT: retq %1 = urem <4 x i16> %x, ret <4 x i16> %1 } diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -489,13 +489,37 @@ ; SSE-NEXT: psubusb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: v16i4: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: v16i4: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 
+; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512F-LABEL: v16i4: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512F-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: v16i4: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: retq %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } @@ -817,7 +841,7 @@ ; ; AVX2-LABEL: v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1112,7 +1136,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; AVX2-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdi) @@ -1124,7 +1148,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] ; AVX512F-NEXT: vpsubusw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubusw %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll --- a/llvm/test/CodeGen/X86/v8i1-masks.ll +++ b/llvm/test/CodeGen/X86/v8i1-masks.ll @@ -129,8 +129,7 @@ ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-AVX2-NEXT: vmovups (%ecx), %ymm0 ; X86-AVX2-NEXT: vcmpnltps (%eax), %ymm0, %ymm0 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; X86-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X86-AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vmovaps %ymm0, (%eax) ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl @@ -139,8 +138,7 @@ ; X64-AVX2: ## %bb.0: ; X64-AVX2-NEXT: vmovups (%rsi), %ymm0 ; X64-AVX2-NEXT: vcmpnltps (%rdi), %ymm0, %ymm0 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; X64-AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vmovaps %ymm0, (%rax) ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq @@ -240,7 +238,7 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 { ; X86-LABEL: two_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; 
X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -248,7 +246,7 @@ ; ; X64-LABEL: two_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vandps %ymm0, %ymm1, %ymm0 @@ -258,8 +256,7 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -267,8 +264,7 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -298,7 +294,7 @@ define <8 x i32> @three_ands(<8 x float> %x) { ; X86-LABEL: three_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -309,7 +305,7 @@ ; ; X64-LABEL: three_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -322,11 +318,10 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -334,11 +329,10 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, 
%ymm0, %ymm2 -; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -374,7 +368,7 @@ define <8 x i32> @four_ands(<8 x float> %x) { ; X86-LABEL: four_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -387,7 +381,7 @@ ; ; X64-LABEL: four_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -402,14 +396,12 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -417,14 +409,12 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -464,7 +454,7 @@ define 
<8 x i32> @five_ands(<8 x float> %x) { ; X86-LABEL: five_ands: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -479,7 +469,7 @@ ; ; X64-LABEL: five_ands: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -496,17 +486,14 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -514,17 +501,14 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; 
X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -568,7 +552,7 @@ define <8 x i32> @two_or(<8 x float> %x) { ; X86-LABEL: two_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -576,7 +560,7 @@ ; ; X64-LABEL: two_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -586,8 +570,7 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -595,8 +578,7 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -628,7 +610,7 @@ define <8 x i32> @three_or(<8 x float> %x) { ; X86-LABEL: three_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -639,7 +621,7 @@ ; ; X64-LABEL: three_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -652,11 +634,10 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 ; 
X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -664,11 +645,10 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -708,7 +688,7 @@ define <8 x i32> @four_or(<8 x float> %x) { ; X86-LABEL: four_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -721,7 +701,7 @@ ; ; X64-LABEL: four_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -736,14 +716,12 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X86-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X86-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -751,14 +729,12 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: 
vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 -; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -804,7 +780,7 @@ define <8 x i32> @five_or(<8 x float> %x) { ; X86-LABEL: five_or: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -819,7 +795,7 @@ ; ; X64-LABEL: five_or: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -836,17 +812,14 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -854,17 +827,14 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1 -; 
X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-AVX2-NEXT: vorps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; @@ -916,7 +886,7 @@ define <8 x i32> @three_or_and(<8 x float> %x) { ; X86-LABEL: three_or_and: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -927,7 +897,7 @@ ; ; X64-LABEL: three_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -940,8 +910,7 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 @@ -952,8 +921,7 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 @@ -994,7 +962,7 @@ define <8 x i32> @four_or_and(<8 x float> %x) { ; X86-LABEL: four_or_and: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vandps %ymm2, %ymm1, %ymm1 @@ -1007,7 +975,7 @@ ; ; X64-LABEL: four_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vandps %ymm2, %ymm1, %ymm1 @@ -1022,13 +990,11 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = 
[-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl @@ -1037,13 +1003,11 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq @@ -1086,7 +1050,7 @@ define <8 x i32> @five_or_and(<8 x float> %x) { ; X86-LABEL: five_or_and: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1101,7 +1065,7 @@ ; ; X64-LABEL: five_or_and: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1118,16 +1082,13 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X86-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; 
X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl @@ -1136,16 +1097,13 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 ; X64-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq @@ -1194,7 +1152,7 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) { ; X86-LABEL: four_or_and_xor: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %ymm2, %ymm1, %ymm1 @@ -1207,7 +1165,7 @@ ; ; X64-LABEL: four_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1 @@ -1222,13 +1180,11 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; X86-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; 
X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl @@ -1237,13 +1193,11 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 ; X64-AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq @@ -1288,7 +1242,7 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) { ; X86-LABEL: five_or_and_xor: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1303,7 +1257,7 @@ ; ; X64-LABEL: five_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1320,15 +1274,12 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0 ; X86-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0 ; X86-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1338,15 +1289,12 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, 
%ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vxorps %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm4 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm4, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vandps %ymm0, %ymm3, %ymm0 ; X64-AVX2-NEXT: vxorps %ymm0, %ymm2, %ymm0 ; X64-AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1397,7 +1345,7 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) { ; X86-LABEL: six_or_and_xor: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1414,7 +1362,7 @@ ; ; X64-LABEL: six_or_and_xor: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vmovaps {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] +; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 ; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -1433,20 +1381,16 @@ ; X86-AVX2: ## %bb.0: ## %entry ; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X86-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2 ; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 ; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3 ; X86-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1 ; X86-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] -; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X86-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: retl ; @@ -1454,20 +1398,16 @@ ; X64-AVX2: ## %bb.0: ## %entry ; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = 
[-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] ; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm2 +; X64-AVX2-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 ; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 ; X64-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1 ; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1 -; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1] -; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0 +; X64-AVX2-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/var-permute-128.ll b/llvm/test/CodeGen/X86/var-permute-128.ll --- a/llvm/test/CodeGen/X86/var-permute-128.ll +++ b/llvm/test/CodeGen/X86/var-permute-128.ll @@ -1179,11 +1179,10 @@ ; AVX2-LABEL: indices_convert: ; AVX2: # %bb.0: # %bb ; AVX2-NEXT: vpbroadcastq (%rax), %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vmovapd (%rax), %xmm1 +; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpermilpd %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovupd %xmm0, (%rax) ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -42,13 +42,12 @@ ; AVX2-LABEL: var_shuffle_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermilpd %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: var_shuffle_v4i64: @@ -462,13 +461,12 @@ ; AVX2-LABEL: var_shuffle_v4f64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,3,2,3] +; AVX2-NEXT: vpermilpd 
%ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: var_shuffle_v4f64: @@ -586,12 +584,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: var_shuffle_v4i64_from_v2i64: @@ -1007,12 +1004,11 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 +; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: var_shuffle_v4f64_from_v2f64: @@ -1290,14 +1286,13 @@ ; AVX2-LABEL: var_shuffle_v4i64_with_v16i8_indices: ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero,xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm0[2,3,2,3] ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpermilpd %ymm1, %ymm2, %ymm2 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: var_shuffle_v4i64_with_v16i8_indices: diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll @@ -3063,10 +3063,10 @@ ; ; AVX-LABEL: strict_vector_fptoui_v4f32_to_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vmovaps {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm4, %xmm4 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vsubps %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll 
b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -1113,12 +1113,12 @@ define <4 x i32> @strict_vector_fptoui_v4f64_to_v4i32(<4 x double> %a) #0 { ; AVX-LABEL: strict_vector_fptoui_v4f64_to_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] ; AVX-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm3[0,2] ; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vmovaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vbroadcastss {{.*#+}} xmm5 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX-NEXT: vblendvps %xmm3, %xmm4, %xmm5, %xmm3 ; AVX-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX-NEXT: vblendvpd %ymm2, %ymm4, %ymm1, %ymm1 @@ -1379,10 +1379,10 @@ define <8 x i32> @strict_vector_fptoui_v8f32_to_v8i32(<8 x float> %a) #0 { ; AVX-LABEL: strict_vector_fptoui_v8f32_to_v8i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] +; AVX-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] ; AVX-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vmovaps {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] +; AVX-NEXT: vbroadcastss {{.*#+}} ymm4 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] ; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm4, %ymm4 ; AVX-NEXT: vblendvps %ymm2, %ymm3, %ymm1, %ymm1 ; AVX-NEXT: vsubps %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -405,17 +405,23 @@ ; AVX1-64-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX1-64-NEXT: retq ; -; AVX2-LABEL: uitofp_v8i32_v8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} +; AVX2-32-LABEL: uitofp_v8i32_v8f32: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-32-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-32-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-32-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; AVX2-32-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: uitofp_v8i32_v8f32: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-64-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-64-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-64-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-64-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-64-NEXT: retq ; ; AVX512F-LABEL: uitofp_v8i32_v8f32: ; AVX512F: # %bb.0: @@ -472,12 +478,17 @@ ; AVX1-64-NEXT: vcvtdq2pd %xmm0, %ymm0 ; AVX1-64-NEXT: retq ; -; AVX2-LABEL: uitofp_v4i1_v4f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX2-NEXT: ret{{[l|q]}} +; AVX2-32-LABEL: uitofp_v4i1_v4f64: +; AVX2-32: # %bb.0: +; AVX2-32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; AVX2-32-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-32-NEXT: retl +; +; AVX2-64-LABEL: uitofp_v4i1_v4f64: +; AVX2-64: # %bb.0: +; AVX2-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-64-NEXT: vcvtdq2pd %xmm0, %ymm0 +; AVX2-64-NEXT: retq ; ; AVX512F-LABEL: uitofp_v4i1_v4f64: ; AVX512F: # %bb.0: @@ -831,32 +842,31 @@ ; AVX2-64-LABEL: uitofp_v4i64_v4f64: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-64-NEXT: vextractps $3, %xmm1, %eax +; AVX2-64-NEXT: vextractps $2, %xmm1, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; AVX2-64-NEXT: vextractps $1, %xmm1, %eax +; AVX2-64-NEXT: vmovd %xmm1, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-64-NEXT: vextractps $3, %xmm0, %eax +; AVX2-64-NEXT: vextractps $2, %xmm0, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; AVX2-64-NEXT: vextractps $1, %xmm0, %eax +; AVX2-64-NEXT: vmovq %xmm0, %rax +; AVX2-64-NEXT: movl %eax, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm3 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9] -; AVX2-64-NEXT: vmulpd %ymm3, %ymm2, %ymm2 -; AVX2-64-NEXT: vextractps $2, %xmm1, %eax +; AVX2-64-NEXT: vpextrd $3, %xmm1, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; AVX2-64-NEXT: vmovd %xmm1, %eax +; AVX2-64-NEXT: vpextrd $1, %xmm1, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX2-64-NEXT: vextractps $2, %xmm0, %eax +; AVX2-64-NEXT: vpextrd $3, %xmm0, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: movl %eax, %eax +; AVX2-64-NEXT: vpextrd $1, %xmm0, %eax ; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 +; AVX2-64-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ; AVX2-64-NEXT: retq ; ; AVX512F-64-LABEL: uitofp_v4i64_v4f64: @@ -1111,8 +1121,7 @@ ; 
; AVX2-64-LABEL: uitofp_v4i64_v4f32: ; AVX2-64: # %bb.0: -; AVX2-64-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-64-NEXT: vpand %ymm1, %ymm0, %ymm1 +; AVX2-64-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-64-NEXT: vpsrlq $1, %ymm0, %ymm2 ; AVX2-64-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-64-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 diff --git a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll --- a/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll +++ b/llvm/test/CodeGen/X86/vec_cmp_uint-128.ll @@ -342,7 +342,7 @@ ; ; AVX2-LABEL: ge_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 @@ -516,7 +516,7 @@ ; ; AVX2-LABEL: gt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -764,7 +764,7 @@ ; ; AVX2-LABEL: le_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 @@ -939,7 +939,7 @@ ; ; AVX2-LABEL: lt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_fabs.ll b/llvm/test/CodeGen/X86/vec_fabs.ll --- a/llvm/test/CodeGen/X86/vec_fabs.ll +++ b/llvm/test/CodeGen/X86/vec_fabs.ll @@ -149,7 +149,7 @@ define <8 x double> @fabs_v8f64(<8 x double> %p) { ; X86-AVX-LABEL: fabs_v8f64: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X86-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] ; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X86-AVX-NEXT: retl @@ -166,7 +166,7 @@ ; ; X64-AVX-LABEL: fabs_v8f64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] +; X64-AVX-NEXT: vbroadcastsd {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN] ; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64-AVX-NEXT: retq @@ -188,7 +188,7 @@ define <16 x float> @fabs_v16f32(<16 x float> %p) { ; X86-AVX-LABEL: fabs_v16f32: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X86-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X86-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X86-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X86-AVX-NEXT: retl @@ -205,7 +205,7 @@ ; ; X64-AVX-LABEL: fabs_v16f32: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] +; X64-AVX-NEXT: vbroadcastss {{.*#+}} ymm2 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN] ; X64-AVX-NEXT: vandps %ymm2, %ymm0, %ymm0 ; X64-AVX-NEXT: vandps %ymm2, %ymm1, %ymm1 ; X64-AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll 
b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -448,30 +448,17 @@ ; SSE-NEXT: orpd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: fptoui_4f64_to_2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd %xmm0, %xmm0 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_4f64_to_2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovapd %xmm0, %xmm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vandpd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; VEX-LABEL: fptoui_4f64_to_2i32: +; VEX: # %bb.0: +; VEX-NEXT: vmovapd %xmm0, %xmm0 +; VEX-NEXT: vcvttpd2dq %ymm0, %xmm1 +; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 +; VEX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VEX-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; VEX-NEXT: vorpd %xmm0, %xmm1, %xmm0 +; VEX-NEXT: vzeroupper +; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_4f64_to_2i32: ; AVX512F: # %bb.0: @@ -710,28 +697,16 @@ ; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: retq ; -; AVX1-LABEL: fptoui_4f64_to_4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vandpd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vorpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_4f64_to_4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vandpd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vorpd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; VEX-LABEL: fptoui_4f64_to_4i32: +; VEX: # %bb.0: +; VEX-NEXT: vcvttpd2dq %ymm0, %xmm1 +; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 +; VEX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 +; VEX-NEXT: vandpd %xmm2, %xmm0, %xmm0 +; VEX-NEXT: vorpd %xmm0, %xmm1, %xmm0 +; VEX-NEXT: vzeroupper +; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_4f64_to_4i32: ; AVX512F: # %bb.0: @@ -1164,26 +1139,15 @@ ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: fptoui_2f32_to_2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_2f32_to_2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; 
AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; VEX-LABEL: fptoui_2f32_to_2i32: +; VEX: # %bb.0: +; VEX-NEXT: vcvttps2dq %xmm0, %xmm1 +; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 +; VEX-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 +; VEX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; VEX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_2f32_to_2i32: ; AVX512F: # %bb.0: @@ -1226,26 +1190,15 @@ ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: fptoui_4f32_to_4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvttps2dq %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_4f32_to_4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; VEX-LABEL: fptoui_4f32_to_4i32: +; VEX: # %bb.0: +; VEX-NEXT: vcvttps2dq %xmm0, %xmm1 +; VEX-NEXT: vpsrad $31, %xmm1, %xmm2 +; VEX-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vcvttps2dq %xmm0, %xmm0 +; VEX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; VEX-NEXT: vpor %xmm0, %xmm1, %xmm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: fptoui_4f32_to_4i32: ; AVX512F: # %bb.0: @@ -1481,13 +1434,12 @@ ; ; AVX2-LABEL: fptoui_8f32_to_8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 +; AVX2-NEXT: vcvttps2dq %ymm0, %ymm1 +; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2 +; AVX2-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: fptoui_8f32_to_8i32: diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -52,14 +52,23 @@ ; SSE41-NEXT: cvtpd2ps %xmm0, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_2i32_to_2f32: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vcvtpd2ps %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_2i32_to_2f32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i32_to_2f32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vcvtpd2ps %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i32_to_2f32: ; AVX512F: # %bb.0: @@ -667,13 +676,21 @@ ; SSE41-NEXT: subpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_2i32_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_2i32_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_2i32_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: uitofp_2i32_to_2f64: ; AVX512F: # %bb.0: @@ -943,13 +960,10 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2165,8 +2179,7 @@ ; AVX2-LABEL: uitofp_4i64_to_4f32_undef: ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 @@ -2247,26 +2260,14 @@ ; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: uitofp_4i32_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_4i32_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] -; 
AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq +; VEX-LABEL: uitofp_4i32_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; VEX-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_4i32_to_4f32: ; AVX512F: # %bb.0: @@ -2578,8 +2579,7 @@ ; ; AVX2-LABEL: uitofp_4i64_to_4f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 @@ -2708,13 +2708,10 @@ ; ; AVX2-LABEL: uitofp_8i32_to_8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3343,13 +3340,21 @@ ; SSE41-NEXT: subpd %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; VEX-LABEL: uitofp_load_2i32_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; VEX-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; VEX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq +; AVX1-LABEL: uitofp_load_2i32_to_2f64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uitofp_load_2i32_to_2f64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: 
uitofp_load_2i32_to_2f64: ; AVX512F: # %bb.0: @@ -3642,13 +3647,10 @@ ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -4371,8 +4373,7 @@ ; AVX2-LABEL: uitofp_load_4i64_to_4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2 ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0 @@ -4458,28 +4459,15 @@ ; SSE41-NEXT: addps %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: uitofp_load_4i32_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; AVX1-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_4i32_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; AVX2-NEXT: vsubps %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq +; VEX-LABEL: uitofp_load_4i32_to_4f32: +; VEX: # %bb.0: +; VEX-NEXT: vmovdqa (%rdi), %xmm0 +; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 +; VEX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; VEX-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; VEX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; VEX-NEXT: retq ; ; AVX512F-LABEL: uitofp_load_4i32_to_4f32: ; AVX512F: # %bb.0: @@ -4786,7 +4774,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovapd (%rdi), %ymm2 ; AVX1-NEXT: vmovapd 32(%rdi), %ymm3 -; AVX1-NEXT: vmovapd {{.*#+}} ymm4 = [1,1,1,1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324] ; AVX1-NEXT: vandpd %ymm4, %ymm3, 
%ymm5 ; AVX1-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 @@ -4998,13 +4986,10 @@ ; AVX2-LABEL: uitofp_load_8i32_to_8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; AVX2-NEXT: vsubps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -5640,7 +5625,8 @@ ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX1-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX1-NEXT: # xmm6 = mem[0,0] ; AVX1-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX1-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] @@ -5649,7 +5635,8 @@ ; AVX1-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vsubpd %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vaddpd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX1-NEXT: # xmm2 = mem[0,0] ; AVX1-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovupd %xmm0, (%rdi) @@ -5661,12 +5648,13 @@ ; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX2-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX2-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX2-NEXT: # xmm6 = mem[0,0] ; AVX2-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX2-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] @@ -5675,7 +5663,8 @@ ; AVX2-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX2-NEXT: vsubpd %xmm6, %xmm1, %xmm1 ; AVX2-NEXT: 
vaddpd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovupd %xmm0, (%rdi) @@ -5687,12 +5676,13 @@ ; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] ; AVX512F-NEXT: vpor %xmm4, %xmm3, %xmm3 ; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] ; AVX512F-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] +; AVX512F-NEXT: # xmm6 = mem[0,0] ; AVX512F-NEXT: vsubpd %xmm6, %xmm0, %xmm0 ; AVX512F-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] @@ -5701,7 +5691,8 @@ ; AVX512F-NEXT: vpor %xmm5, %xmm1, %xmm1 ; AVX512F-NEXT: vsubpd %xmm6, %xmm1, %xmm1 ; AVX512F-NEXT: vaddpd %xmm1, %xmm2, %xmm1 -; AVX512F-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512F-NEXT: # xmm2 = mem[0,0] ; AVX512F-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vmovupd %xmm0, (%rdi) @@ -5742,7 +5733,8 @@ ; AVX512DQ-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 ; AVX512DQ-NEXT: vcvtuqq2pd %zmm1, %zmm1 -; AVX512DQ-NEXT: vmovapd {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512DQ-NEXT: vmovddup {{.*#+}} xmm2 = [5.0E-1,5.0E-1] +; AVX512DQ-NEXT: # xmm2 = mem[0,0] ; AVX512DQ-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ; AVX512DQ-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ; AVX512DQ-NEXT: vmovupd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/vec_minmax_uint.ll b/llvm/test/CodeGen/X86/vec_minmax_uint.ll --- a/llvm/test/CodeGen/X86/vec_minmax_uint.ll +++ b/llvm/test/CodeGen/X86/vec_minmax_uint.ll @@ -71,7 +71,7 @@ ; ; AVX2-LABEL: max_gt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -477,7 +477,7 @@ ; ; AVX2-LABEL: max_ge_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -882,7 +882,7 @@ ; ; AVX2-LABEL: min_lt_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1290,7 +1290,7 @@ ; ; AVX2-LABEL: min_le_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 
= [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll --- a/llvm/test/CodeGen/X86/vec_shift6.ll +++ b/llvm/test/CodeGen/X86/vec_shift6.ll @@ -162,8 +162,7 @@ ; ; AVX2-LABEL: test7: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048] ; AVX2-NEXT: vpmullw %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq @@ -227,8 +226,7 @@ ; ; AVX2-LABEL: test8: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3] -; AVX2-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3] ; AVX2-NEXT: vpsllvd %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpsllvd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1816,7 +1816,7 @@ ; AVX2-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm3, %ymm0, %ymm4 @@ -2546,7 +2546,7 @@ ; AVX2-NEXT: vpmulhw %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm3 ; AVX2-NEXT: vpackuswb %ymm6, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX2-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -855,16 +855,27 @@ ; SSE-NEXT: movdqa %xmm1, (%rdi) ; SSE-NEXT: retq ; -; AVX-LABEL: uaddo_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm0 -; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovdqa %xmm1, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: uaddo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: uaddo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm0 
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: uaddo_v2i64: ; AVX512: # %bb.0: @@ -978,9 +989,9 @@ ; ; AVX1-LABEL: uaddo_v4i24: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] -; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll --- a/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll +++ b/llvm/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math | FileCheck %s --check-prefix=CST --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+sse4.1 | FileCheck %s --check-prefix=CST --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64 -enable-unsafe-fp-math -mattr=+avx | FileCheck %s --check-prefix=CST --check-prefix=AVX @@ -41,48 +42,43 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) { ; SSE2-LABEL: test_uitofp_v4i32_to_v4f32: -; SSE2: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]] -; SSE2-NEXT: pand %xmm0, [[MASK]] -; After this instruction, MASK will have the value of the low parts -; of the vector. -; SSE2-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]] -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0 -; SSE2-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0 -; SSE2-NEXT: addps [[MASK]], %xmm0 -; SSE2-NEXT: retq +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: retq ; -; Currently we commute the arguments of the first blend, but this could be -; improved to match the lowering of the second blend. 
; SSE41-LABEL: test_uitofp_v4i32_to_v4f32: -; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]] -; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0 -; SSE41-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0 -; SSE41-NEXT: addps [[LOWVEC]], %xmm0 -; SSE41-NEXT: retq +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; SSE41-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_uitofp_v4i32_to_v4f32: -; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]] -; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] -; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] -; AVX-NEXT: vsubps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]] -; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 -; AVX-NEXT: retq +; AVX: # %bb.0: +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq ; -; The lowering for AVX2 is a bit messy, because we select broadcast -; instructions, instead of folding the constant loads. ; AVX2-LABEL: test_uitofp_v4i32_to_v4f32: -; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]] -; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] -; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] -; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]] -; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]] -; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 -; AVX2-NEXT: retq +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX2-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_uitofp_v4i32_to_v4f32: ; AVX512F: # %bb.0: @@ -96,6 +92,12 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VL-NEXT: retq +; After this instruction, MASK will have the value of the low parts +; of the vector. +; Currently we commute the arguments of the first blend, but this could be +; improved to match the lowering of the second blend. +; The lowering for AVX2 is a bit messy, because we select broadcast +; instructions, instead of folding the constant loads. %tmp = uitofp <4 x i32> %arg to <4 x float> ret <4 x float> %tmp } @@ -116,72 +118,84 @@ ; two sequences of instructions. 
; ; SSE2-LABEL: test_uitofp_v8i32_to_v8f32: -; SSE2: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] -; SSE2-NEXT: pand %[[MASK]], [[VECLOW]] -; SSE2-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] -; SSE2-NEXT: por %[[LOWCST]], [[VECLOW]] -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] -; SSE2-NEXT: por %[[HIGHCST]], %xmm0 -; SSE2-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; SSE2-NEXT: subps %[[MAGICCST]], %xmm0 -; SSE2-NEXT: addps [[VECLOW]], %xmm0 -; MASK is the low vector of the second part after this point. -; SSE2-NEXT: pand %xmm1, %[[MASK]] -; SSE2-NEXT: por %[[LOWCST]], %[[MASK]] -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %[[HIGHCST]], %xmm1 -; SSE2-NEXT: subps %[[MAGICCST]], %xmm1 -; SSE2-NEXT: addps %[[MASK]], %xmm1 -; SSE2-NEXT: retq +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm6 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; SSE2-NEXT: subps %xmm6, %xmm0 +; SSE2-NEXT: addps %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: subps %xmm6, %xmm1 +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: retq ; ; SSE41-LABEL: test_uitofp_v8i32_to_v8f32: -; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] -; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] -; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] -; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0 -; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; SSE41-NEXT: subps %[[MAGICCST]], %xmm0 -; SSE41-NEXT: addps [[VECLOW]], %xmm0 -; LOWCST is the low vector of the second part after this point. -; The operands of the blend are inverted because we reuse xmm1 -; in the next shift. 
-; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]] -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1 -; SSE41-NEXT: subps %[[MAGICCST]], %xmm1 -; SSE41-NEXT: addps %[[LOWCST]], %xmm1 -; SSE41-NEXT: retq +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; SSE41-NEXT: subps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: subps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq ; -; Test that we are not lowering uinttofp to scalars -; AVX-NOT: cvtsd2ss -; AVX: retq +; AVX-LABEL: test_uitofp_v8i32_to_v8f32: +; AVX: # %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq ; ; AVX2-LABEL: test_uitofp_v8i32_to_v8f32: -; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]] -; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]] -; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]] -; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]] -; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]] -; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0 -; AVX2-NEXT: retq +; AVX2: # %bb.0: +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: test_uitofp_v8i32_to_v8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512F-NEXT: # kill +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: test_uitofp_v8i32_to_v8f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; MASK is the low vector of the second part after this point. +; LOWCST is the low vector of the second part after this point. +; The operands of the blend are inverted because we reuse xmm1 +; in the next shift. 
+; Test that we are not lowering uinttofp to scalars %tmp = uitofp <8 x i32> %arg to <8 x float> ret <8 x float> %tmp } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vec_uint_to_fp.ll b/llvm/test/CodeGen/X86/vec_uint_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_uint_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_uint_to_fp.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck --check-prefix=CHECK --check-prefix=SSE --check-prefix=CST %s ; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse4.1 | FileCheck --check-prefix=CHECK --check-prefix=SSE41 --check-prefix=CST %s ; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+avx | FileCheck --check-prefix=CHECK --check-prefix=AVX --check-prefix=CST %s @@ -38,47 +39,50 @@ ; AVX2-NEXT: .long 0x53000080 ## float 5.49764202E+11 define <4 x float> @test1(<4 x i32> %A) nounwind { -; CHECK-LABEL: test1: +; SSE-LABEL: test1: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: addps %xmm1, %xmm0 +; SSE-NEXT: retq ; -; SSE: movdqa [[MASKCSTADDR]](%rip), [[MASK:%xmm[0-9]+]] -; SSE-NEXT: pand %xmm0, [[MASK]] +; SSE41-LABEL: test1: +; SSE41: ## %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; SSE41-NEXT: subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: retq +; +; AVX-LABEL: test1: +; AVX: ## %bb.0: +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test1: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; AVX2-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; After this instruction, MASK will have the value of the low parts ; of the vector. -; SSE-NEXT: por [[LOWCSTADDR]](%rip), [[MASK]] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: por [[HIGHCSTADDR]](%rip), %xmm0 -; SSE-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0 -; SSE-NEXT: addps [[MASK]], %xmm0 -; SSE-NEXT: retq -; ; Currently we commute the arguments of the first blend, but this could be ; improved to match the lowering of the second blend. 
-; SSE41: movdqa [[LOWCSTADDR]](%rip), [[LOWVEC:%xmm[0-9]+]] -; SSE41-NEXT: pblendw $85, %xmm0, [[LOWVEC]] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pblendw $170, [[HIGHCSTADDR]](%rip), %xmm0 -; SSE41-NEXT: subps [[MAGICCSTADDR]](%rip), %xmm0 -; SSE41-NEXT: addps [[LOWVEC]], %xmm0 -; SSE41-NEXT: retq -; -; AVX: vpblendw $170, [[LOWCSTADDR]](%rip), %xmm0, [[LOWVEC:%xmm[0-9]+]] -; AVX-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] -; AVX-NEXT: vpblendw $170, [[HIGHCSTADDR]](%rip), [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] -; AVX-NEXT: vsubps [[MAGICCSTADDR]](%rip), [[HIGHVEC]], [[TMP:%xmm[0-9]+]] -; AVX-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 -; AVX-NEXT: retq -; ; The lowering for AVX2 is a bit messy, because we select broadcast ; instructions, instead of folding the constant loads. -; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%xmm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[LOWCST]], %xmm0, [[LOWVEC:%xmm[0-9]+]] -; AVX2-NEXT: vpsrld $16, %xmm0, [[SHIFTVEC:%xmm[0-9]+]] -; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%xmm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%xmm[0-9]+]] -; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%xmm[0-9]+]] -; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%xmm[0-9]+]] -; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %xmm0 -; AVX2-NEXT: retq %C = uitofp <4 x i32> %A to <4 x float> ret <4 x float> %C } @@ -94,74 +98,128 @@ ; AVX2-NEXT: .long 0x53000080 ## float 5.49764202E+11 define <8 x float> @test2(<8 x i32> %A) nounwind { -; CHECK-LABEL: test2: +; SSE-LABEL: test2: +; SSE: ## %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE-NEXT: por %xmm4, %xmm3 +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movaps {{.*#+}} xmm6 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; SSE-NEXT: subps %xmm6, %xmm0 +; SSE-NEXT: addps %xmm3, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm2 +; SSE-NEXT: por %xmm4, %xmm2 +; SSE-NEXT: psrld $16, %xmm1 +; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: subps %xmm6, %xmm1 +; SSE-NEXT: addps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; SSE41-LABEL: test2: +; SSE41: ## %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] +; SSE41-NEXT: subps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: subps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: test2: +; AVX: ## %bb.0: +; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 +; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX-NEXT: 
vcvtdq2ps %ymm1, %ymm1 +; AVX-NEXT: vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 +; AVX-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test2: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4],mem[5],ymm0[6],mem[7],ymm0[8],mem[9],ymm0[10],mem[11],ymm0[12],mem[13],ymm0[14],mem[15] +; AVX2-NEXT: vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: retq ; Legalization will break the thing is 2 x <4 x i32> on anthing prior AVX. ; The constant used for in the vector instruction are shared between the ; two sequences of instructions. -; -; SSE: movdqa {{.*#+}} [[MASK:xmm[0-9]+]] = [65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] -; SSE-NEXT: pand %[[MASK]], [[VECLOW]] -; SSE-NEXT: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] -; SSE-NEXT: por %[[LOWCST]], [[VECLOW]] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] -; SSE-NEXT: por %[[HIGHCST]], %xmm0 -; SSE-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; SSE-NEXT: subps %[[MAGICCST]], %xmm0 -; SSE-NEXT: addps [[VECLOW]], %xmm0 ; MASK is the low vector of the second part after this point. -; SSE-NEXT: pand %xmm1, %[[MASK]] -; SSE-NEXT: por %[[LOWCST]], %[[MASK]] -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %[[HIGHCST]], %xmm1 -; SSE-NEXT: subps %[[MAGICCST]], %xmm1 -; SSE-NEXT: addps %[[MASK]], %xmm1 -; SSE-NEXT: retq -; -; SSE41: movdqa {{.*#+}} [[LOWCST:xmm[0-9]+]] = [1258291200,1258291200,1258291200,1258291200] -; SSE41-NEXT: movdqa %xmm0, [[VECLOW:%xmm[0-9]+]] -; SSE41-NEXT: pblendw $170, %[[LOWCST]], [[VECLOW]] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} [[HIGHCST:xmm[0-9]+]] = [1392508928,1392508928,1392508928,1392508928] -; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm0 -; SSE41-NEXT: movaps {{.*#+}} [[MAGICCST:xmm[0-9]+]] = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] -; SSE41-NEXT: subps %[[MAGICCST]], %xmm0 -; SSE41-NEXT: addps [[VECLOW]], %xmm0 ; LOWCST is the low vector of the second part after this point. ; The operands of the blend are inverted because we reuse xmm1 ; in the next shift. 
-; SSE41-NEXT: pblendw $85, %xmm1, %[[LOWCST]] -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pblendw $170, %[[HIGHCST]], %xmm1 -; SSE41-NEXT: subps %[[MAGICCST]], %xmm1 -; SSE41-NEXT: addps %[[LOWCST]], %xmm1 -; SSE41-NEXT: retq -; ; Test that we are not lowering uinttofp to scalars -; AVX-NOT: cvtsd2ss -; AVX: retq -; -; AVX2: vpbroadcastd [[LOWCSTADDR]](%rip), [[LOWCST:%ymm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[LOWCST]], %ymm0, [[LOWVEC:%ymm[0-9]+]] -; AVX2-NEXT: vpsrld $16, %ymm0, [[SHIFTVEC:%ymm[0-9]+]] -; AVX2-NEXT: vpbroadcastd [[HIGHCSTADDR]](%rip), [[HIGHCST:%ymm[0-9]+]] -; AVX2-NEXT: vpblendw $170, [[HIGHCST]], [[SHIFTVEC]], [[HIGHVEC:%ymm[0-9]+]] -; AVX2-NEXT: vbroadcastss [[MAGICCSTADDR]](%rip), [[MAGICCST:%ymm[0-9]+]] -; AVX2-NEXT: vsubps [[MAGICCST]], [[HIGHVEC]], [[TMP:%ymm[0-9]+]] -; AVX2-NEXT: vaddps [[TMP]], [[LOWVEC]], %ymm0 -; AVX2-NEXT: retq %C = uitofp <8 x i32> %A to <8 x float> ret <8 x float> %C } define <4 x double> @test3(<4 x i32> %arg) { -; CHECK-LABEL: test3: +; SSE-LABEL: test3: +; SSE: ## %bb.0: +; SSE-NEXT: movapd %xmm0, %xmm1 +; SSE-NEXT: xorpd %xmm2, %xmm2 +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: movapd {{.*#+}} xmm3 = [4.503599627370496E+15,4.503599627370496E+15] +; SSE-NEXT: orpd %xmm3, %xmm0 +; SSE-NEXT: subpd %xmm3, %xmm0 +; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE-NEXT: orpd %xmm3, %xmm1 +; SSE-NEXT: subpd %xmm3, %xmm1 +; SSE-NEXT: retq +; +; SSE41-LABEL: test3: +; SSE41: ## %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4.503599627370496E+15,4.503599627370496E+15] +; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: subpd %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE41-NEXT: por %xmm2, %xmm1 +; SSE41-NEXT: subpd %xmm2, %xmm1 +; SSE41-NEXT: retq +; +; AVX-LABEL: test3: +; AVX: ## %bb.0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX-NEXT: vorpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX-NEXT: retq +; +; AVX2-LABEL: test3: +; AVX2: ## %bb.0: +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15,4.503599627370496E+15] +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq ; This test used to crash because we were custom lowering it as if it was ; a conversion between <4 x i32> and <4 x float>. -; AVX: vsubpd -; AVX2: vsubpd -; CHECK: retq %tmp = uitofp <4 x i32> %arg to <4 x double> ret <4 x double> %tmp } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} +; CST: {{.*}} diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1555,7 +1555,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31] ; AVX2-NEXT: vpmullw %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm5 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] @@ -2216,7 +2216,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15],ymm2[24],ymm4[24],ymm2[25],ymm4[25],ymm2[26],ymm4[26],ymm2[27],ymm4[27],ymm2[28],ymm4[28],ymm2[29],ymm4[29],ymm2[30],ymm4[30],ymm2[31],ymm4[31] ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15],ymm0[24],ymm4[24],ymm0[25],ymm4[25],ymm0[26],ymm4[26],ymm0[27],ymm4[27],ymm0[28],ymm4[28],ymm0[29],ymm4[29],ymm0[30],ymm4[30],ymm0[31],ymm4[31] ; AVX2-NEXT: vpmullw %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm6, %ymm5, %ymm7 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[16],ymm4[16],ymm2[17],ymm4[17],ymm2[18],ymm4[18],ymm2[19],ymm4[19],ymm2[20],ymm4[20],ymm2[21],ymm4[21],ymm2[22],ymm4[22],ymm2[23],ymm4[23] ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[16],ymm4[16],ymm0[17],ymm4[17],ymm0[18],ymm4[18],ymm0[19],ymm4[19],ymm0[20],ymm4[20],ymm0[21],ymm4[21],ymm0[22],ymm4[22],ymm0[23],ymm4[23] @@ -2745,11 +2745,11 @@ ; ; AVX1-LABEL: umulo_v4i24: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = 
[16777215,16777215,16777215,16777215] -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -902,16 +902,27 @@ ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: usubo_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpsubq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm0 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovdqa %xmm1, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: usubo_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovdqa %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: usubo_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: retq ; ; AVX512-LABEL: usubo_v2i64: ; AVX512: # %bb.0: @@ -1025,9 +1036,9 @@ ; ; AVX1-LABEL: usubo_v4i24: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] -; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -447,18 +447,44 @@ ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v16i8: +; AVX1: # %bb.0: +; 
AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v16i8: ; XOP: # %bb.0: @@ -524,19 +550,47 @@ ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, 
%xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v8i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v8i16: ; XOP: # %bb.0: @@ -609,19 +663,47 @@ ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v4i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: 
vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v4i32: ; XOP: # %bb.0: @@ -696,19 +778,47 @@ ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: retq ; -; AVX-LABEL: test_bitreverse_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] -; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: test_bitreverse_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_bitreverse_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: test_bitreverse_v2i64: +; AVX512: # %bb.0: +; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] +; AVX512-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15] +; AVX512-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX512-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX512-NEXT: retq ; ; XOP-LABEL: test_bitreverse_v2i64: ; XOP: # %bb.0: @@ -822,7 +932,7 @@ ; ; AVX2-LABEL: test_bitreverse_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -835,7 +945,7 @@ ; ; AVX512-LABEL: test_bitreverse_v32i8: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -878,8 +988,7 @@ ; ; GFNIAVX2-LABEL: test_bitreverse_v32i8: ; GFNIAVX2: # %bb.0: -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: test_bitreverse_v32i8: @@ -998,7 +1107,7 @@ ; AVX2-LABEL: test_bitreverse_v16i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1012,7 +1121,7 @@ ; AVX512-LABEL: test_bitreverse_v16i16: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1064,8 +1173,7 @@ ; GFNIAVX2-LABEL: test_bitreverse_v16i16: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: test_bitreverse_v16i16: @@ -1194,7 +1302,7 @@ ; AVX2-LABEL: test_bitreverse_v8i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1208,7 +1316,7 @@ ; AVX512-LABEL: test_bitreverse_v8i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1260,8 +1368,7 @@ ; GFNIAVX2-LABEL: test_bitreverse_v8i32: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: retq ; ; GFNIAVX512-LABEL: test_bitreverse_v8i32: @@ -1394,7 +1501,7 @@ ; AVX2-LABEL: test_bitreverse_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1408,7 +1515,7 @@ ; AVX512-LABEL: test_bitreverse_v4i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1460,8 +1567,7 @@ ; GFNIAVX2-LABEL: test_bitreverse_v4i64: ; GFNIAVX2: # %bb.0: ; GFNIAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; GFNIAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; GFNIAVX2-NEXT: vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0 +; GFNIAVX2-NEXT: vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; GFNIAVX2-NEXT: 
retq ; ; GFNIAVX512-LABEL: test_bitreverse_v4i64: @@ -1635,7 +1741,7 @@ ; ; AVX2-LABEL: test_bitreverse_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm3 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -1655,7 +1761,7 @@ ; AVX512F-LABEL: test_bitreverse_v64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -1723,7 +1829,7 @@ ; ; GFNIAVX1-LABEL: test_bitreverse_v64i8: ; GFNIAVX1: # %bb.0: -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1 ; GFNIAVX1-NEXT: retq @@ -1933,7 +2039,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -1956,7 +2062,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -2036,7 +2142,7 @@ ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} 
ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 @@ -2281,7 +2387,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -2304,7 +2410,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -2384,7 +2490,7 @@ ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 @@ -2637,7 +2743,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm4 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -2660,7 +2766,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; AVX512F-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vmovdqa 
{{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240] ; AVX512F-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -2740,7 +2846,7 @@ ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; GFNIAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; GFNIAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; GFNIAVX1-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; GFNIAVX1-NEXT: vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0 ; GFNIAVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; GFNIAVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll --- a/llvm/test/CodeGen/X86/vector-blend.ll +++ b/llvm/test/CodeGen/X86/vector-blend.ll @@ -262,11 +262,17 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: vsel_i8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: vsel_i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: vsel_i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967040,4294967040,4294967040,4294967040] +; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq entry: %vsel = select <16 x i1> , <16 x i8> %v1, <16 x i8> %v2 ret <16 x i8> %vsel @@ -619,7 +625,7 @@ ; ; AVX1-LABEL: constant_pblendvb_avx2: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303] ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -627,7 +633,7 @@ ; ; AVX2-LABEL: constant_pblendvb_avx2: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -2706,7 +2706,7 @@ ; AVX2-LABEL: mul_v4i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,1,1,1] +; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 ; AVX2-NEXT: vpmulld %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq @@ -2789,7 +2789,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpslld $31, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = 
[1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm3, %ymm0 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -2882,7 +2882,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45,1.40129846E-45] ; AVX2-NEXT: vblendvps %ymm2, %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -3069,7 +3069,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,32,64,128] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm6 ; AVX2-NEXT: vpcmpeqq %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,1,1,1] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324] ; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm6, %ymm3 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [1,2,4,8] ; AVX2-NEXT: vpand %ymm5, %ymm4, %ymm4 @@ -5685,7 +5685,7 @@ ; AVX2-NEXT: vpmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpslld $31, %xmm5, %xmm5 ; AVX2-NEXT: vpmovsxdq %xmm5, %ymm5 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [1,1,1,1] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm6 = [4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324] ; AVX2-NEXT: vblendvpd %ymm5, %ymm6, %ymm3, %ymm5 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -431,7 +431,8 @@ ; ; AVX-LABEL: constrained_vector_fmul_v2f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: # xmm0 = mem[0,0] ; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -493,7 +494,8 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq @@ -516,17 +518,11 @@ ; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; -; AVX1-LABEL: constrained_vector_fmul_v4f64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovapd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX512-LABEL: constrained_vector_fmul_v4f64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] -; AVX512-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), 
%ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: constrained_vector_fmul_v4f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vbroadcastsd {{.*#+}} ymm0 = [1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308,1.7976931348623157E+308] +; AVX-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq entry: %mul = call <4 x double> @llvm.experimental.constrained.fmul.v4f64( <4 x double> @llvm.experimental.constrained.fadd.v4f64( <4 x double> @llvm.experimental.constrained.fsub.v4f64( <4 x double> This Inner Loop Header: Depth=1 ; AVX2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2 -; AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm3 -; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm3 +; AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm2 +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax) ; AVX2-NEXT: addq $16, %rax ; AVX2-NEXT: jne .LBB8_1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -22,7 +22,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [3.1126135687998532E-322,3.1126135687998532E-322,3.1126135687998532E-322,3.1126135687998532E-322] ; AVX1-NEXT: vandnps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 @@ -123,7 +123,7 @@ ; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [3.1126135687998532E-322,3.1126135687998532E-322,3.1126135687998532E-322,3.1126135687998532E-322] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 @@ -395,7 +395,7 @@ ; AVX512BW-LABEL: var_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 @@ -451,7 +451,7 @@ ; ; XOPAVX2-LABEL: var_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm6 @@ -573,11 +573,11 @@ ; ; AVX512F-LABEL: var_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpandn %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $4, %ymm1, %ymm6 ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm6, %ymm6 @@ -719,7 +719,7 @@ ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; XOPAVX2-NEXT: vextracti128 $1, %ymm2, %xmm5 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249] ; XOPAVX2-NEXT: vpaddb %xmm6, %xmm5, %xmm7 ; XOPAVX2-NEXT: vpshlb %xmm7, %xmm3, %xmm3 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 @@ -761,7 +761,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -772,7 +772,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -794,7 +794,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -849,7 +849,7 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq $1, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 @@ -1306,7 +1306,7 @@ ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [31,31,31,31,31,31,31,31] +; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [4.34402524E-44,4.34402524E-44,4.34402524E-44,4.34402524E-44,4.34402524E-44,4.34402524E-44,4.34402524E-44,4.34402524E-44] ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm4 = [32,32,32,32,32,32,32,32] ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB8_1: # %loop diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -222,19 +222,19 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm5 ; AVX512F-NEXT: vpsrlw $4, %ymm5, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm7 ; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm9 ; AVX512F-NEXT: vpsllw $5, %ymm9, %ymm9 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $2, %ymm5, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm10 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm10, %ymm7 ; AVX512F-NEXT: vpaddb %ymm9, %ymm9, %ymm9 ; AVX512F-NEXT: vpblendvb %ymm9, %ymm7, %ymm5, %ymm5 @@ -260,12 +260,12 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm5, %zmm1, %zmm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm4 ; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 @@ -426,7 +426,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 @@ -448,7 +448,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq $1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -87,7 +87,7 @@ ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsllvq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -224,8 +224,7 @@ ; ; AVX2-LABEL: var_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 @@ -367,7 +366,7 @@ ; AVX2-LABEL: var_funnnel_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -388,7 +387,7 @@ ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 @@ -418,7 +417,7 @@ ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -731,17 +730,29 @@ ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpsllq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsllq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpsllq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -163,8 
+163,7 @@ ; ; AVX2-LABEL: var_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32] ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1 @@ -281,7 +280,7 @@ ; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] ; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3 @@ -299,7 +298,7 @@ ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 @@ -326,7 +325,7 @@ ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -581,7 +580,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsllq %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -36,12 +36,12 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: 
vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpsllvd %zmm4, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpsrlvd %zmm3, %zmm5, %zmm3 @@ -139,7 +139,7 @@ ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8 ; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -77,8 +77,7 @@ ; ; AVX2-LABEL: var_funnnel_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -84,7 +84,7 @@ ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -95,7 +95,7 @@ ; ; AVX512F-LABEL: var_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -117,7 +117,7 @@ ; ; AVX512BW-LABEL: var_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -168,7 +168,7 @@ ; ; XOPAVX2-LABEL: var_funnnel_v2i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlvq %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -604,7 +604,7 @@ ; AVX512BW-LABEL: var_funnnel_v8i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw 
{{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -641,18 +641,31 @@ ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: var_funnnel_v8i16: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; XOP-NEXT: vpsubw %xmm4, %xmm5, %xmm4 -; XOP-NEXT: vpshlw %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddw %xmm0, %xmm0, %xmm0 -; XOP-NEXT: vpshlw %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: var_funnnel_v8i16: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubw %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshlw %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlw %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_funnnel_v8i16: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpsubw %xmm4, %xmm5, %xmm4 +; XOPAVX2-NEXT: vpshlw %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpaddw %xmm0, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpshlw %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: var_funnnel_v8i16: ; X86-SSE2: # %bb.0: @@ -825,41 +838,73 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: var_funnnel_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpsllw $5, %xmm4, %xmm4 -; AVX-NEXT: vpaddb %xmm4, %xmm4, %xmm5 -; AVX-NEXT: vpsrlw $4, %xmm1, %xmm6 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 -; AVX-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $2, %xmm1, %xmm4 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm4 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX-NEXT: vpaddb %xmm5, %xmm5, %xmm5 -; AVX-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpsllw $5, %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm2, %xmm2, %xmm3 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm4 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: var_funnnel_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsllw $5, %xmm4, %xmm4 +; AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm6 +; AVX1-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 +; AVX1-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm4 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm4 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX1-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 +; AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm4 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX1-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX1-NEXT: vpaddb %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: var_funnnel_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsllw $5, %xmm4, %xmm4 +; AVX2-NEXT: vpaddb %xmm4, %xmm4, %xmm5 +; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm6 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6 +; AVX2-NEXT: vpblendvb %xmm4, %xmm6, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $2, %xmm1, %xmm4 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm4 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpblendvb %xmm5, %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpsllw $5, %xmm2, %xmm2 +; AVX2-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm4 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4 +; AVX2-NEXT: vpblendvb %xmm2, %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $2, %xmm0, %xmm2 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; AVX2-NEXT: vpaddb %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpblendvb %xmm3, %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: var_funnnel_v16i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero @@ -945,18 +990,31 @@ ; AVX512VLVBMI2-NEXT: vzeroupper ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: var_funnnel_v16i8: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; XOP-NEXT: vpsubb %xmm4, %xmm5, %xmm4 -; XOP-NEXT: vpshlb %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddb %xmm0, %xmm0, %xmm0 -; XOP-NEXT: vpshlb %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: var_funnnel_v16i8: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX1-NEXT: vpsubb %xmm4, %xmm5, %xmm4 +; XOPAVX1-NEXT: vpshlb %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: var_funnnel_v16i8: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; XOPAVX2-NEXT: vpsubb %xmm4, %xmm5, %xmm4 +; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpaddb %xmm0, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpshlb %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: var_funnnel_v16i8: ; X86-SSE2: # %bb.0: @@ -1041,20 +1099,31 @@ ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; AVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1076,7 +1145,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v2i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: 
vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1113,16 +1182,27 @@ ; AVX512VLVBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VLVBMI2-NEXT: retq ; -; XOP-LABEL: splatvar_funnnel_v2i64: -; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] -; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 -; XOP-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 -; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 -; XOP-NEXT: vpaddq %xmm0, %xmm0, %xmm0 -; XOP-NEXT: vpsllq %xmm2, %xmm0, %xmm0 -; XOP-NEXT: vpor %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq +; XOPAVX1-LABEL: splatvar_funnnel_v2i64: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX1-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: splatvar_funnnel_v2i64: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 +; XOPAVX2-NEXT: vpsrlq %xmm4, %xmm1, %xmm1 +; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 +; XOPAVX2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpsllq %xmm2, %xmm0, %xmm0 +; XOPAVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: retq ; ; X86-SSE2-LABEL: splatvar_funnnel_v2i64: ; X86-SSE2: # %bb.0: @@ -1376,25 +1456,38 @@ ; SSE-NEXT: movdqa %xmm3, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX1-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 +; AVX2-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm4 = 
[255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm4, %xmm3, %xmm3 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX2-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512F-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 @@ -1420,7 +1513,7 @@ ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm3, %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %xmm4, %xmm3, %xmm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX512BW-NEXT: vpsrlw %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -22,7 +22,7 @@ define <4 x i64> @var_funnnel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %amt) nounwind { ; AVX1-LABEL: var_funnnel_v4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [3.1126135687998532E-322,3.1126135687998532E-322,3.1126135687998532E-322,3.1126135687998532E-322] ; AVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 @@ -124,7 +124,7 @@ ; ; XOPAVX1-LABEL: var_funnnel_v4i64: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovaps {{.*#+}} ymm3 = [63,63,63,63] +; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm3 = [3.1126135687998532E-322,3.1126135687998532E-322,3.1126135687998532E-322,3.1126135687998532E-322] ; XOPAVX1-NEXT: vandps %ymm3, %ymm2, %ymm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm4, %xmm5 ; XOPAVX1-NEXT: vpxor %xmm6, %xmm6, %xmm6 @@ -423,7 +423,7 @@ ; AVX512BW-LABEL: var_funnnel_v16i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512BW-NEXT: vpsrlvw %zmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %ymm3, %ymm2, %ymm2 @@ -483,7 +483,7 @@ ; ; XOPAVX2-LABEL: var_funnnel_v16i16: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; XOPAVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; XOPAVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 ; XOPAVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 @@ -576,7 +576,7 @@ ; ; AVX2-LABEL: var_funnnel_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX2-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX2-NEXT: vpaddb %ymm4, %ymm4, %ymm5 @@ -608,7 +608,7 @@ ; ; AVX512F-LABEL: var_funnnel_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpsllw $5, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm4, %ymm4, %ymm5 @@ -752,7 +752,7 @@ ; XOPAVX2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; XOPAVX2-NEXT: vpshlb %xmm4, %xmm1, %xmm1 ; XOPAVX2-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} xmm4 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; XOPAVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm5 ; XOPAVX2-NEXT: vpaddb %xmm5, %xmm5, %xmm5 @@ -792,7 +792,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -803,7 +803,7 @@ ; ; AVX512F-LABEL: splatvar_funnnel_v4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -825,7 +825,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v4i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -881,7 +881,7 @@ ; ; XOPAVX2-LABEL: splatvar_funnnel_v4i64: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; XOPAVX2-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX2-NEXT: vpsrlq %xmm4, %ymm1, %ymm1 ; XOPAVX2-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1163,7 +1163,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX2-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1176,7 +1176,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512F-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1202,7 +1202,7 @@ ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; AVX512BW-NEXT: vpsrlw %xmm2, %ymm0, %ymm0 @@ -1654,7 +1654,7 @@ ; AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1676,7 +1676,7 @@ ; AVX512F-NEXT: vpaddb %ymm0, %ymm0, %ymm0 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -225,16 +225,16 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm4 ; AVX512F-NEXT: vpsllw $4, %ymm4, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm6 ; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpxor %ymm7, %ymm3, %ymm8 ; AVX512F-NEXT: vpsllw $5, %ymm8, %ymm8 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpsllw $2, %ymm4, %ymm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512F-NEXT: vpand %ymm6, %ymm9, %ymm6 ; AVX512F-NEXT: vpaddb %ymm8, %ymm8, %ymm8 ; AVX512F-NEXT: vpblendvb %ymm8, %ymm6, %ymm4, %ymm4 @@ -257,17 +257,17 @@ ; AVX512F-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512F-NEXT: vpsrlw $4, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpsllw $5, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $2, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = 
[127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm5, %ymm8, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm3 @@ -424,7 +424,7 @@ define <8 x i64> @splatvar_funnnel_v8i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %amt) nounwind { ; AVX512F-LABEL: splatvar_funnnel_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -446,7 +446,7 @@ ; ; AVX512BW-LABEL: splatvar_funnnel_v8i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlq %xmm4, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -628,7 +628,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX512F-NEXT: vpsrlw %xmm2, %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] ; AVX512F-NEXT: vpsrlw %xmm2, %ymm3, %ymm3 @@ -911,7 +911,7 @@ ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64] ; AVX512F-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1] diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -87,7 +87,7 @@ ; ; AVX2-LABEL: var_funnnel_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlvq %xmm3, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -232,8 +232,7 @@ ; ; AVX2-LABEL: var_funnnel_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 
= [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 @@ -390,7 +389,7 @@ ; AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 @@ -406,7 +405,7 @@ ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512F-NEXT: vpsrlvd %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512F-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 @@ -436,7 +435,7 @@ ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -753,17 +752,29 @@ ; SSE-NEXT: por %xmm4, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX-NEXT: vpsubq %xmm1, %xmm4, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vpsrlq %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpsubq %xmm1, %xmm4, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v2i64: ; AVX512F: # %bb.0: @@ -1089,25 +1100,38 @@ ; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_funnnel_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpsrlw %xmm1, 
%xmm0, %xmm0 -; AVX-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_funnnel_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_funnnel_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: splatvar_funnnel_v16i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512F-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 @@ -1133,7 +1157,7 @@ ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -171,8 +171,7 @@ ; ; AVX2-LABEL: var_funnnel_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32] ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1 @@ -297,7 +296,7 @@ ; AVX2-NEXT: vpsrlvd %ymm5, %ymm0, %ymm5 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = 
ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] ; AVX2-NEXT: vpsllvd %ymm5, %ymm3, %ymm3 @@ -315,7 +314,7 @@ ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpsrlvd %zmm2, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 @@ -342,7 +341,7 @@ ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -607,7 +606,7 @@ ; ; AVX2-LABEL: splatvar_funnnel_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpsrlq %xmm3, %ymm0, %ymm3 ; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 @@ -923,7 +922,7 @@ ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 @@ -936,7 +935,7 @@ ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 @@ -962,7 +961,7 @@ ; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -36,12 +36,12 @@ define <32 x i16> @var_funnnel_v32i16(<32 x i16> %x, <32 x i16> %amt) nounwind { ; AVX512F-LABEL: var_funnnel_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpsrlvd %zmm4, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpsllvd %zmm3, %zmm5, %zmm3 @@ -371,7 +371,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpsrlw %xmm1, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpsrlw %xmm1, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -83,8 +83,7 @@ ; ; AVX2-LABEL: var_funnnel_v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll +++ 
b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll @@ -222,7 +222,7 @@ ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -240,7 +240,7 @@ ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -507,22 +507,38 @@ ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] -; AVX2-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-NEXT: vpaddd %xmm0, %xmm1, %xmm1 -; AVX2-NEXT: vpsrld $31, %xmm1, %xmm2 -; AVX2-NEXT: vpsrad $2, %xmm1, %xmm1 -; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_4i32: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; AVX2NOBW-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2NOBW-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpsrld $31, %xmm1, %xmm2 +; AVX2NOBW-NEXT: vpsrad $2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2454267027,2454267027,2454267027,2454267027] +; AVX512BW-NEXT: vpmuldq %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmuldq %xmm2, %xmm0, %xmm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512BW-NEXT: vpaddd %xmm0, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsrld $31, %xmm1, %xmm2 +; AVX512BW-NEXT: vpsrad $2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] +; AVX512BW-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: retq %res = srem <4 x i32> %a, ret <4 x i32> %res } @@ -619,7 +635,7 @@ ; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2 ; 
AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -641,7 +657,7 @@ ; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw $2, %xmm1, %xmm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpxor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpsrlw $7, %xmm1, %xmm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-256.ll @@ -201,7 +201,7 @@ ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] @@ -211,7 +211,7 @@ ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm0, %ymm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm0, %ymm0 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -228,7 +228,7 @@ ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpsrlw $2, %ymm0, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $7, %ymm0, %ymm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -485,22 +485,38 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; 
AVX2-LABEL: test_rem7_8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] -; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmuldq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $31, %ymm1, %ymm2 -; AVX2-NEXT: vpsrad $2, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] -; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_8i32: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] +; AVX2NOBW-NEXT: vpmuldq %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmuldq %ymm2, %ymm0, %ymm2 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2NOBW-NEXT: vpaddd %ymm0, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsrld $31, %ymm1, %ymm2 +; AVX2NOBW-NEXT: vpsrad $2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_8i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027] +; AVX512BW-NEXT: vpmuldq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmuldq %ymm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX512BW-NEXT: vpaddd %ymm0, %ymm1, %ymm1 +; AVX512BW-NEXT: vpsrld $31, %ymm1, %ymm2 +; AVX512BW-NEXT: vpsrad $2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %res = srem <8 x i32> %a, ret <8 x i32> %res } @@ -595,7 +611,7 @@ ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX2NOBW-NEXT: vpmulhw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] @@ -605,7 +621,7 @@ ; AVX2NOBW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpsrlw $2, %ymm1, %ymm2 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX2NOBW-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -626,7 +642,7 @@ ; AVX512BW-NEXT: vpaddb %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpsrlw $2, %ymm1, %ymm2 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512BW-NEXT: vpbroadcastb {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512BW-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpsrlw $7, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 @@ -739,7 +755,7 @@ ; AVX2NOBW-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2NOBW-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -103,7 +103,7 @@ ; AVX512F-LABEL: test_div7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] ; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vpsrlw $15, %ymm1, %ymm3 ; AVX512F-NEXT: vpsraw $1, %ymm1, %ymm1 @@ -132,7 +132,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; 
AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] @@ -141,12 +141,12 @@ ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512F-NEXT: vpxor %ymm7, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubb %ymm7, %ymm1, %ymm1 @@ -221,7 +221,7 @@ ; AVX512F-NEXT: vpsrlw $8, %ymm5, %ymm5 ; AVX512F-NEXT: vpackuswb %ymm4, %ymm5, %ymm4 ; AVX512F-NEXT: vpsrlw $7, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm5, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm1, %ymm4, %ymm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] @@ -411,7 +411,7 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725,18725] ; AVX512F-NEXT: vpmulhw %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpsrlw $15, %ymm3, %ymm4 ; AVX512F-NEXT: vpsraw $1, %ymm3, %ymm3 @@ -448,7 +448,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632,37632] ; AVX512F-NEXT: vpmulhw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] @@ -457,17 +457,17 @@ ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpaddb %ymm1, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm7 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] ; AVX512F-NEXT: vpxor %ymm3, %ymm8, %ymm3 ; AVX512F-NEXT: vpaddb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsubb %ymm8, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm9 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm5, %ymm9, %ymm5 ; AVX512F-NEXT: vpsubb %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 @@ -550,12 +550,12 @@ ; AVX512F-NEXT: vpsrlw $8, %ymm6, %ymm6 ; AVX512F-NEXT: vpackuswb %ymm5, %ymm6, %ymm5 ; AVX512F-NEXT: vpsrlw $7, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm3, %ymm7, %ymm7 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm5, %ymm5 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll 
b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-128.ll @@ -578,22 +578,38 @@ ; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] -; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpsrld $1, %xmm2, %xmm2 -; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_4i32: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX2NOBW-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX2NOBW-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX2NOBW-NEXT: vpsrld $1, %xmm2, %xmm2 +; AVX2NOBW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX2NOBW-NEXT: vpsrld $2, %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2NOBW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_4i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757] +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpmuludq %xmm2, %xmm0, %xmm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] +; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm2 +; AVX512BW-NEXT: vpsrld $1, %xmm2, %xmm2 +; AVX512BW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX512BW-NEXT: vpsrld $2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7] +; AVX512BW-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: retq %res = urem <4 x i32> %a, ret <4 x i32> %res } diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-256.ll @@ -202,7 +202,7 @@ ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -516,22 +516,38 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: test_rem7_8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] -; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpsrld $2, %ymm1, %ymm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] -; AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2NOBW-LABEL: test_rem7_8i32: +; AVX2NOBW: # %bb.0: +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 +; AVX2NOBW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] +; AVX2NOBW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX2NOBW-NEXT: vpsubd %ymm1, %ymm0, %ymm2 +; AVX2NOBW-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2NOBW-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX2NOBW-NEXT: vpsrld $2, %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2NOBW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX2NOBW-NEXT: retq +; +; AVX512BW-LABEL: test_rem7_8i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757] +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7] +; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX512BW-NEXT: vpsubd %ymm1, %ymm0, %ymm2 +; AVX512BW-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX512BW-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX512BW-NEXT: vpsrld $2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpmulld %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: retq %res = urem <8 x i32> %a, ret <8 x i32> %res } @@ -624,7 +640,7 @@ ; AVX2NOBW: # %bb.0: ; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2NOBW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX2NOBW-NEXT: vpmullw %ymm3, %ymm2, 
%ymm2 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -777,7 +793,7 @@ ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 -; AVX2NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2NOBW-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2NOBW-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX2NOBW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] ; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll --- a/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-udiv-512.ll @@ -103,7 +103,7 @@ define <32 x i16> @test_div7_32i16(<32 x i16> %a) nounwind { ; AVX512F-LABEL: test_div7_32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] ; AVX512F-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpsubw %ymm2, %ymm0, %ymm3 ; AVX512F-NEXT: vpsrlw $1, %ymm3, %ymm3 @@ -135,7 +135,7 @@ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512F-NEXT: vpmullw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] @@ -144,7 +144,7 @@ ; AVX512F-NEXT: vpackuswb %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpsubb %ymm2, %ymm0, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm4, %ymm2 ; AVX512F-NEXT: vpsrlw $2, %ymm2, %ymm2 
@@ -431,7 +431,7 @@ ; AVX512F-LABEL: test_rem7_32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363,9363] ; AVX512F-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vpsubw %ymm3, %ymm1, %ymm4 ; AVX512F-NEXT: vpsrlw $1, %ymm4, %ymm4 @@ -471,7 +471,7 @@ ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm4 = [37,37,37,37,37,37,37,37,37,37,37,37,37,37,37,37] ; AVX512F-NEXT: vpmullw %ymm4, %ymm3, %ymm3 ; AVX512F-NEXT: vpsrlw $8, %ymm3, %ymm3 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23] @@ -480,14 +480,14 @@ ; AVX512F-NEXT: vpackuswb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpsubb %ymm3, %ymm1, %ymm5 ; AVX512F-NEXT: vpsrlw $1, %ymm5, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512F-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512F-NEXT: vpaddb %ymm3, %ymm5, %ymm3 ; AVX512F-NEXT: vpsrlw $2, %ymm3, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm5 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512F-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512F-NEXT: vpsllw $3, %ymm3, %ymm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm8 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] ; AVX512F-NEXT: vpand %ymm7, %ymm8, %ymm7 ; AVX512F-NEXT: vpsubb %ymm7, %ymm3, %ymm3 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 @@ -570,7 +570,7 @@ ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $8, %ymm2, %ymm2 ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %ymm2, %ymm5, %ymm5 ; AVX512F-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[4],ymm1[4],ymm4[5],ymm1[5],ymm4[6],ymm1[6],ymm4[7],ymm1[7],ymm4[16],ymm1[16],ymm4[17],ymm1[17],ymm4[18],ymm1[18],ymm4[19],ymm1[19],ymm4[20],ymm1[20],ymm4[21],ymm1[21],ymm4[22],ymm1[22],ymm4[23],ymm1[23] ; AVX512F-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm4, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -1395,8 +1395,7 @@ ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm10, %ymm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] @@ -1410,8 +1409,7 @@ ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] @@ -2884,8 +2882,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm9, %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0,1,2,3,4],ymm10[5,6,7],ymm0[8,9,10,11,12],ymm10[13,14,15] @@ -2920,11 +2917,10 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm9, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0,1,2,3,4],ymm9[5,6,7],ymm1[8,9,10,11,12],ymm9[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2938,8 +2934,8 @@ ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermd 
%ymm11, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm3[2],ymm7[3],ymm3[4],ymm7[5,6],ymm3[7],ymm7[8,9],ymm3[10],ymm7[11],ymm3[12],ymm7[13,14],ymm3[15] @@ -6005,8 +6001,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,4,6,0,1,4,6,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm11 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0,1,2,3,4],ymm11[5,6,7],ymm10[8,9,10,11,12],ymm11[13,14,15] @@ -6074,8 +6069,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,4,7,0,2,4,7,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm10 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -2355,8 +2355,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [3,6,2,5,3,6,2,5] -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5,6],ymm6[7] @@ -2404,7 +2403,7 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0],xmm12[1],xmm13[2,3,4,5],xmm12[6],xmm13[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[0,1,2,3,0,1,14,15,12,13,10,11,u,u,u,u] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm13 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm13 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] @@ -2450,8 +2449,7 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1,2,3,4,5,6,7],ymm13[8],ymm9[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm15, %ymm13 ; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} ymm15 = ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm15, %ymm15 @@ -2470,12 +2468,10 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,4,7,0,0,4,7,0] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] @@ -4911,7 +4907,7 @@ ; ; AVX2-FAST-LABEL: load_i16_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-NEXT: subq $616, %rsp # imm = 0x268 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 @@ -4935,7 +4931,7 @@ ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 @@ -4945,7 +4941,7 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm12[3],ymm11[4,5],ymm12[6],ymm11[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] @@ -5013,42 +5009,38 @@ ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2],ymm0[3],ymm15[4,5],ymm0[6],ymm15[7] ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [3,6,2,5,3,6,2,5] -; AVX2-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm15, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm14 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm7[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm2[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12 +; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm1[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm11 +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd $31, (%rsp), %ymm4, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3,4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm10[2],ymm5[3,4,5],ymm10[6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1],ymm15[2],ymm5[3,4,5],ymm15[6],ymm5[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm11[4],xmm4[5],xmm11[6],xmm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] @@ -5061,7 +5053,7 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm14[2],ymm3[3,4,5],ymm14[6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4,5],ymm14[6],ymm13[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4],xmm0[5],xmm4[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 @@ -5072,103 +5064,100 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm10[3],ymm5[4,5],ymm10[6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm15[3],ymm5[4,5],ymm15[6],ymm5[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3,4,5],xmm0[6],xmm2[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [21474836482,21474836482,21474836482,21474836482] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm14[3],ymm3[4,5],ymm14[6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm14[3],ymm13[4,5],ymm14[6],ymm13[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm11[0],xmm0[1],xmm11[2,3,4,5],xmm0[6],xmm11[7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3],ymm5[4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4,5],xmm2[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm7[0,1,1,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm10[0,1,1,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vpblendw 
{{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3,4,5],xmm9[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm13[0,1,1,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm3[0,1,1,3] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm5[1],ymm10[2,3,4],ymm5[5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm5[1],ymm15[2,3,4],ymm5[5],ymm15[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0],xmm0[1],xmm6[2],xmm0[3],xmm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm8[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm11 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0,1],ymm10[2],ymm0[3,4,5],ymm10[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} ymm6 = <0,3,7,2,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm6, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm12[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm4[1,2,3,4,5,6,7],ymm8[8],ymm4[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm7[1],ymm14[2,3,4],ymm7[5],ymm14[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0],ymm13[1],ymm14[2,3,4],ymm13[5],ymm14[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0],xmm4[1],xmm8[2],xmm4[3],xmm8[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm14[2],ymm9[3,4,5],ymm14[6],ymm9[7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm9[0,1],mem[2],ymm9[3,4,5],mem[6],ymm9[7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm12[2],ymm1[3,4],ymm12[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm6[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm7 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -5178,14 +5167,13 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm2, %ymm4 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = 
ymm15[0,1],ymm11[2],ymm15[3,4],ymm11[5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm8 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7],ymm8[8,9,10,11,12],ymm4[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] @@ -5195,20 +5183,19 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm4[1,2,3,4,5,6,7],ymm6[8],ymm4[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0,1,2,3,4],ymm2[5,6,7],ymm6[8,9,10,11,12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm12[2,3],ymm1[4,5],ymm12[6,7] @@ -5216,28 +5203,24 @@ ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm14[3],ymm9[4,5],ymm14[6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm6 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm0[0],ymm2[1,2,3,4,5,6,7],ymm0[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm15[2,3],mem[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,4,7,0,0,4,7,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload @@ -5250,22 +5233,22 @@ ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,4,0,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm8[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7],ymm3[8,9,10,11,12],ymm1[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 
32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,2],ymm12[3],mem[4,5],ymm12[6],mem[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -5275,9 +5258,9 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rdx) @@ -5303,7 +5286,7 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-NEXT: addq $680, %rsp # imm = 0x2A8 +; AVX2-FAST-NEXT: addq $616, %rsp # imm = 0x268 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -10966,8 +10949,9 @@ ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $1544, %rsp # imm = 0x608 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11002,37 +10986,37 @@ ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm5[2],ymm15[3,4,5],ymm5[6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm5[2],ymm6[3,4,5],ymm5[6],ymm6[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm12[1],ymm4[2,3,4],ymm12[5],ymm4[6,7] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; 
AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4,5],ymm4[6],ymm3[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4,5],ymm0[6],ymm3[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4],xmm3[5],xmm4[6],xmm3[7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] @@ -11043,50 +11027,47 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,1,u,4,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,5,1,u,4,u,u,u> ; 
AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $72, (%rsp), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm15[0,1,2],mem[3],ymm15[4,5],mem[6],ymm15[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2],ymm0[3],mem[4,5],ymm0[6],mem[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm12[2],ymm15[3,4],ymm12[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2],ymm12[3],mem[4,5],ymm12[6],mem[7] +; AVX2-FAST-NEXT: vpblendd $72, (%rsp), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm12[0,1,2],mem[3],ymm12[4,5],mem[6],ymm12[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3,4,5],xmm4[6],xmm5[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 @@ -11098,8 +11079,7 @@ ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; 
AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 @@ -11110,35 +11090,33 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0],ymm0[1],ymm12[2,3],ymm0[4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0],ymm12[1],ymm13[2,3],ymm12[4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm6[2,3],ymm15[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm15, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm6[1],ymm9[2,3],ymm6[4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3,4,5],xmm5[6],xmm4[7] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1],ymm14[2,3],ymm6[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -11146,14 +11124,14 @@ ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm2[0],xmm1[1],xmm2[2],xmm1[3],xmm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2],ymm1[3],mem[4,5],ymm1[6],mem[7] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,6,7,6,7,6,7,8,9,4,5,10,11,0,1,22,23,22,23,22,23,22,23,24,25,20,21,26,27,16,17] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,4,5,2,3,0,1,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -11164,112 +11142,99 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm6[1],ymm9[2,3,4],ymm6[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2],ymm14[3],ymm6[4,5],ymm14[6],ymm6[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,3,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm0[1],ymm13[2,3,4],ymm0[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm12[1],ymm13[2,3,4],ymm12[5],ymm13[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,3,2,3] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm11[3],ymm10[4,5],ymm11[6],ymm10[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [3,6,2,5,3,6,2,5] -; AVX2-FAST-NEXT: # ymm1 = 
mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm9[3],ymm11[4,5],ymm9[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,0,1,6,7,8,9,14,15,12,13,14,15,16,17,18,19,16,17,22,23,24,25,30,31,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm10[3],ymm14[4,5],ymm10[6],ymm14[7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 640(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm14 -; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vmovdqa 
352(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm1[3],ymm4[4,5],ymm1[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm1, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2],ymm4[3],ymm13[4,5],ymm4[6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2] -; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm4[3],ymm7[4,5],ymm4[6],ymm7[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 864(%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,2] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendd $31, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm10[2],ymm11[3,4,5],ymm10[6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm11[2],ymm9[3,4,5],ymm11[6],ymm9[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm6[4],xmm0[5],xmm6[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,0,1,14,15,12,13,10,11,8,9,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm9 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm12[2],ymm9[3,4,5],ymm12[6],ymm9[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-FAST-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm8[2],ymm14[3,4,5],ymm8[6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm14[2],ymm10[3,4,5],ymm14[6],ymm10[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5],xmm3[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 @@ -11280,155 +11245,166 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1],ymm13[2],ymm7[3,4,5],ymm13[6],ymm7[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3,4,5],ymm13[6],ymm12[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4],xmm0[5],xmm2[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm7[2],ymm4[3,4,5],ymm7[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm4 -; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm11[3],ymm5[4,5],ymm11[6],ymm5[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = 
[0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,0,1,14,15,12,13,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [21474836482,21474836482,21474836482,21474836482] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm15 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm12 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm6 +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm15 +; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm14[3],ymm10[4,5],ymm14[6],ymm10[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2],ymm13[3],ymm7[4,5],ymm13[6],ymm7[7] -; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd 
%ymm8, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2],ymm10[3],ymm12[4,5],ymm10[6],ymm12[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2,3,4,5],xmm0[6],xmm3[7] -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,7,8,9,10,11,12,13,12,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm10[1],ymm4[2,3],ymm10[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm5[1],ymm9[2,3],ymm5[4],ymm9[5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm13 +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5],xmm1[6],xmm0[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,2,3,0,1,14,15,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm5[0,1,1,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm7 -; 
AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm12[1],ymm15[2,3],ymm12[4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm6[1],ymm15[2,3],ymm6[4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3,4,5],xmm5[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm6[0,1,1,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,1,3] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm0[1,2,3,4,5,6,7],ymm6[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0],ymm11[1],ymm13[2,3],ymm11[4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[2,3],ymm12[4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm6 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3,4,5],xmm6[6],xmm0[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm14[0,1,1,3] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm12, %ymm15 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm4[0,1,1,3] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm11[0],ymm6[1,2,3,4,5,6,7],ymm11[8],ymm6[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm6[1,2,3,4,5,6,7],ymm2[8],ymm6[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm15[1],ymm8[2,3],ymm15[4],ymm8[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm15[1],ymm7[2,3],ymm15[4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm11[1],xmm6[2,3,4,5],xmm11[6],xmm6[7] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,1,1,3] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm8[0,1,1,3] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 
32-byte Reload -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1,2,3,4,5,6,7],ymm3[8],ymm1[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm10[1],ymm4[2,3,4],ymm10[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm14[2],ymm10[3,4],ymm14[5],ymm10[6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm14[0,1],ymm13[2],ymm14[3,4,5],ymm13[6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm13[0,1],mem[2],ymm13[3,4,5],mem[6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,3,7,2,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] @@ -11436,13 +11412,15 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm7[1],mem[2,3,4],ymm7[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, 
%ymm2, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm9 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -11461,26 +11439,25 @@ ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4,5,6,7] -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,6,7,4,5,2,3,0,1,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] +; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm12[2],mem[3,4],ymm12[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4,5],mem[6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm12[2],ymm9[3,4,5],ymm12[6],ymm9[7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] @@ -11495,221 +11472,219 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,26,27] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $219, (%rsp), %ymm11, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm11[2],mem[3,4],ymm11[5],mem[6,7] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm15[2],mem[3,4,5],ymm15[6],mem[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4,5],ymm3[6],mem[7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,3,3,3,0,3,7,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [2,5,1,4,2,5,1,4] -; AVX2-FAST-NEXT: # ymm11 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm3[5,6,7],ymm5[8,9,10,11,12],ymm3[13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm3[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm10 ; AVX2-FAST-NEXT: vpshufb 
{{.*#+}} xmm6 = xmm6[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2],ymm13[3],ymm14[4,5],ymm13[6],ymm14[7] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3] +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm13[0,1,2],mem[3],ymm13[4,5],mem[6],ymm13[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,4,7,3,6,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm5[1,2,3,4,5,6,7],ymm0[8],ymm5[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1],ymm13[2],ymm6[3,4],ymm13[5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm7[2,3],ymm8[4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14 +; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-FAST-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,4,7,3,6,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm9 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm4, %ymm14 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm4[0,1],mem[2],ymm4[3,4],mem[5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7],ymm5[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vpblendd $51, (%rsp), %ymm12, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm12[2,3],mem[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm8[0,1,2],mem[3],ymm8[4,5],mem[6],ymm8[7] -; AVX2-FAST-NEXT: vpermd %ymm9, %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm4[0,1],mem[2,3],ymm4[4,5],mem[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm9[0,1,2],ymm12[3],ymm9[4,5],ymm12[6],ymm9[7] +; AVX2-FAST-NEXT: 
vmovdqa {{.*#+}} ymm1 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,3,3,0,3,7,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm15, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm12[2],ymm8[3,4],ymm12[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm11, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd $51, (%rsp), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm11[2,3],mem[4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,4,5,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2],ymm15[3],mem[4,5],ymm15[6],mem[7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2],ymm2[3],mem[4,5],ymm2[6],mem[7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,4,7,0,0,4,7,0] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm13[2,3],ymm6[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,6,7,10,11,6,7,4,5,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm14[0],mem[1],ymm14[2,3],mem[4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,4,0,3,7,u,u,u> -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] -; 
AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2,3],ymm2[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7],ymm2[8,9,10,11,12],ymm3[13,14,15] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5],mem[6],ymm4[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5],mem[6],ymm5[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1],mem[2,3],ymm1[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm3 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm3[0,1,2,3,4],ymm4[5,6,7],ymm3[8,9,10,11,12],ymm4[13,14,15] -; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm1[0,1,2],mem[3],ymm1[4,5],mem[6],ymm1[7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] -; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm5 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1],ymm12[2,3],ymm8[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5,6,7],ymm3[8,9,10,11,12],ymm5[13,14,15] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,4,6,7] +; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm3[1,2,3,4,5,6,7],ymm5[8],ymm3[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $183, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2],ymm4[3],mem[4,5],ymm4[6],mem[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7],ymm4[8,9,10,11,12],ymm1[13,14,15] +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $72, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[0,1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] ; 
AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm13, %ymm1 -; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7],ymm1[8],ymm0[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3,4,5,6,7],ymm0[8],ymm1[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -285,14 +285,13 @@ ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> @@ -315,14 +314,13 @@ ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-NEXT: 
# ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> @@ -345,14 +343,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> @@ -557,7 +554,7 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u> @@ -567,8 +564,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [0,0,3,6,0,0,3,6] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <1,4,7,2,5,u,u,u> @@ -606,7 +602,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u> @@ -616,8 +612,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm0[1],ymm5[2,3],ymm0[4],ymm5[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = <1,4,7,2,5,u,u,u> @@ -654,7 +649,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm3 ; 
AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm6, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u> @@ -664,8 +659,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = <1,4,7,2,5,u,u,u> @@ -1105,7 +1099,7 @@ ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm15 ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u> @@ -1132,8 +1126,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm0, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1209,7 +1202,7 @@ ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> @@ -1231,31 +1224,30 @@ ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm1[1],ymm14[2,3],ymm1[4],ymm14[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm9[2],ymm11[3,4],ymm9[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> ; 
AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpermps %ymm12, %ymm1, %ymm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1],ymm5[2],ymm15[3,4],ymm5[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm6[2],ymm14[3,4],ymm6[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm11[2],ymm9[3,4],ymm11[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [0,1,0,3,0,1,4,7] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm9, %ymm11 @@ -1271,10 +1263,10 @@ ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm9, %ymm5 ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm14[2],ymm6[3,4],ymm14[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rsi) ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload @@ -1283,12 +1275,12 @@ ; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rsi) -; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) ; AVX2-FAST-NEXT: vmovaps %ymm10, (%rdx) ; AVX2-FAST-NEXT: vmovaps %ymm8, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX2-FAST-NEXT: vmovaps %ymm2, 32(%rcx) @@ -1310,7 +1302,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-PERLANE-NEXT: 
vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm10[1],ymm15[2,3],ymm10[4],ymm15[5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = <0,3,6,1,4,7,u,u> @@ -1337,8 +1329,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm10[2],ymm15[3,4],ymm10[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2250,7 +2241,7 @@ ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> @@ -2327,8 +2318,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] @@ -2495,7 +2485,7 @@ ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm10[1],ymm14[2,3],ymm10[4],ymm14[5,6],ymm10[7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2571,8 +2561,7 @@ ; AVX2-FAST-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm15, %ymm1 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] @@ -2732,7 +2721,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836482,21474836482,21474836482,21474836482] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm2 = ymm12[0],ymm2[1],ymm12[2,3],ymm2[4],ymm12[5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm13 = <0,3,6,1,4,7,u,u> @@ -2809,8 +2798,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm7[2],ymm12[3,4],ymm7[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm15 = <1,4,7,2,5,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm14[5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -332,10 +332,11 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [17179869184,17179869184,17179869184,17179869184] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm5 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm5, %ymm6 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm7 @@ -344,27 +345,30 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836481,21474836481,21474836481,21474836481] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm9[0],xmm5[1],xmm9[1] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = <1,5,u,u> ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm11, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [25769803778,25769803778,25769803778,25769803778] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = ; AVX2-ONLY-NEXT: vpermps %ymm4, 
%ymm11, %ymm4 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm7 = [30064771075,30064771075,30064771075,30064771075] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] @@ -655,107 +659,114 @@ ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf16: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $104, %rsp -; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $120, %rsp ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [17179869184,17179869184,17179869184,17179869184] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm9, %ymm12 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm9 +; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm10 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm12 = +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm12, %ymm13 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm12, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm12 ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), 
%xmm13 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm13[0],xmm12[1],xmm13[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [21474836481,21474836481,21474836481,21474836481] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm6 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm9, %ymm1 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm14 = +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm14, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm15 ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm6[0],xmm15[1],xmm6[1] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u> -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm14 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm9, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm14, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm14 ; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm7 = [25769803778,25769803778,25769803778,25769803778] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm7, %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm7 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm8 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm8 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm8 = xmm10[2],mem[2],xmm10[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm10 = ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload -; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm8 = +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [30064771075,30064771075,30064771075,30064771075] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm9, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm15[2],xmm6[2],xmm15[3],xmm6[3] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <3,7,u,u> -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm9, %ymm2 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm10, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm6, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 32(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm7, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm7, 32(%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm8, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm4, (%r8) -; 
AVX2-ONLY-NEXT: addq $104, %rsp +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) +; AVX2-ONLY-NEXT: addq $120, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -1356,70 +1367,70 @@ ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $712, %rsp # imm = 0x2C8 -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm7 +; AVX2-ONLY-NEXT: subq $728, %rsp # imm = 0x2D8 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm6 -; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vmovaps %ymm4, %ymm10 ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [17179869184,17179869184,17179869184,17179869184] -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm4 = +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm1 -; 
AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm3, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm4, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps 
%xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 @@ -1428,38 +1439,38 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm3 = [21474836481,21474836481,21474836481,21474836481] -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm9, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm4, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm6 = <1,5,u,u> -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <1,5,u,u> +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm3, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm4, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm12 +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm13 ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm3, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = 
ymm10[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm6, %ymm10 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm2 @@ -1467,98 +1478,102 @@ ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm12 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm13 +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm6 ; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm12[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm3, %ymm6 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm12, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; 
AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm6 +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm12, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [25769803778,25769803778,25769803778,25769803778] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm11 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm6 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm12, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1,2,3,4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm6[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm12, %ymm6 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm10 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm10 = xmm2[2],mem[2],xmm2[3],mem[3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [30064771075,30064771075,30064771075,30064771075] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm10, %ymm0 +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm10, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm12[2],xmm5[2],xmm12[3],xmm5[3] +; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm5[2],xmm13[3],xmm5[3] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm2 = <3,7,u,u> ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm6, %ymm1 ; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm10, %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm10, %ymm5 -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm6, %ymm5 +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte 
Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm6 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm9[2],mem[2],xmm9[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] @@ -1583,12 +1598,12 @@ ; AVX2-ONLY-NEXT: vmovaps %ymm3, 96(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%rcx) -; AVX2-ONLY-NEXT: vmovaps %ymm13, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm12, (%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm2, 96(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm5, 32(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm1, 64(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm0, (%r8) -; AVX2-ONLY-NEXT: addq $712, %rsp # imm = 0x2C8 +; AVX2-ONLY-NEXT: addq $728, %rsp # imm = 0x2D8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -2833,365 +2848,363 @@ ; ; AVX2-ONLY-LABEL: load_i32_stride4_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1960, %rsp # imm = 0x7A8 +; AVX2-ONLY-NEXT: subq $2008, %rsp # imm = 0x7D8 ; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 480(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184] -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 192(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 224(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm0 = +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: 
vmovaps %ymm2, %ymm15 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm10 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 144(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 400(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovaps 704(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm14 +; AVX2-ONLY-NEXT: vmovaps 736(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm8 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 656(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 960(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 992(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %ymm4 +; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 912(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: 
vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 320(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 352(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm3 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 272(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 576(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm13 ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 608(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 528(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm13 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm8 -; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps 832(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm14 +; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 784(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm2 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm1, %ymm15 +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vmovaps %ymm2, %ymm12 -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 96(%rdi), %ymm12 +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm0 +; AVX2-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 64(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm2 +; 
AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [21474836481,21474836481,21474836481,21474836481] -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm10, %ymm2 -; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm5 -; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm7 = <1,5,u,u> -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vpermps %ymm10, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 176(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm4 = <1,5,u,u> +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 384(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 432(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 416(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm5 +; AVX2-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX2-ONLY-NEXT: 
vmovaps 48(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 640(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 688(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1] -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 896(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 944(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 928(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: 
vmovaps (%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm3, %ymm15 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 256(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 304(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm4, %ymm0 +; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm3 +; AVX2-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovaps %ymm11, %ymm12 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 512(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovups %ymm9, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovaps 560(%rdi), %xmm10 -; AVX2-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 544(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm7, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm6, %ymm0 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm6, %ymm1 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovaps 768(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vmovaps 816(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovaps 800(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [25769803778,25769803778,25769803778,25769803778] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm1, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm2 = ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm3, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm3, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm7 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = 
xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload @@ -3199,81 +3212,82 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [30064771075,30064771075,30064771075,30064771075] -; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm1 = <3,7,u,u> -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm1, %ymm3 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm0 = <3,7,u,u> +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm7 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps (%rsp), %xmm3, %xmm5 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm2, %ymm3 ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] +; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm7 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm7 = xmm5[2],mem[2],xmm5[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm4, %ymm2, %ymm3 +; AVX2-ONLY-NEXT: vpermps %ymm15, %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm12, %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] ; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm9, %ymm2, %ymm5 +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermps %ymm11, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-ONLY-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3] -; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1],xmm7[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpermps %ymm13, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpermps %ymm14, %ymm1, %ymm1 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-ONLY-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 192(%rsi) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3323,15 +3337,15 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-ONLY-NEXT: vmovaps %ymm0, 192(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm8, 128(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm3, 64(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm5, 224(%r8) -; AVX2-ONLY-NEXT: vmovaps %ymm9, 160(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm9, 128(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm4, 64(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm8, 224(%r8) +; AVX2-ONLY-NEXT: vmovaps %ymm11, 160(%r8) ; AVX2-ONLY-NEXT: vmovaps %ymm10, 96(%r8) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) -; AVX2-ONLY-NEXT: addq $1960, %rsp # imm = 0x7A8 +; AVX2-ONLY-NEXT: addq $2008, %rsp # imm = 0x7D8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -492,8 +492,7 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [5,2,7,0,5,2,7,0] -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm8, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm7 @@ -505,10 +504,10 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm8[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [21474836480,21474836480,21474836480,21474836480] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [25769803777,25769803777,25769803777,25769803777] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm8 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 
= ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,0,2,2,7,4,6,6] @@ -522,7 +521,7 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [30064771074,30064771074,30064771074,30064771074] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa %ymm5, (%rsi) @@ -993,8 +992,7 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm5[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm12, %ymm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [5,2,7,0,5,2,7,0] -; AVX2-ONLY-NEXT: # ymm14 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = ; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm14, %ymm13 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm13[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm13 @@ -1015,7 +1013,7 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1,2,3,4,5,6],ymm15[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [21474836480,21474836480,21474836480,21474836480] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] @@ -1032,7 +1030,7 @@ ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2],ymm15[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm15 = [25769803777,25769803777,25769803777,25769803777] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm15, %ymm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] @@ -1048,7 +1046,7 @@ ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <4,1,6,u> ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm8[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [30064771074,30064771074,30064771074,30064771074] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] @@ -2037,104 +2035,100 @@ ; ; AVX2-ONLY-LABEL: load_i32_stride5_vf32: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $1032, %rsp # imm = 0x408 -; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm13 -; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: subq $1000, %rsp # imm = 0x3E8 +; AVX2-ONLY-NEXT: vmovdqa 384(%rdi), %ymm10 +; AVX2-ONLY-NEXT: vmovdqu %ymm10, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 416(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: 
vmovdqa 320(%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm11 -; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 544(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 576(%rdi), %ymm14 +; AVX2-ONLY-NEXT: vmovdqa 512(%rdi), %ymm15 +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 480(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm12 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,5,2,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm5 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,2,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 288(%rdi), %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm12[0,1,0,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4],ymm3[5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm4 +; AVX2-ONLY-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3],ymm1[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm15[2,3],ymm8[4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm15, %ymm9 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm11[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm15[2,3],ymm11[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,5,2,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm14[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 608(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm5, %ymm1 -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm13[0,1,0,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm7[0,1,0,3] +; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm8 +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4],ymm2[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 448(%rdi), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm7 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm13 -; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm5, %ymm0 -; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm3 +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm6 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm7[2,3],ymm6[4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm2[0,1,0,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm14 +; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm15 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4],ymm1[5,6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm4[2,3],ymm8[4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm5[2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm12[2,3],ymm5[4,5],ymm12[6,7] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [5,2,7,0,5,2,7,0] -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm12[2,3],ymm4[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] 
; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm3[2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm11, %ymm5 +; AVX2-ONLY-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1],ymm11[2,3],ymm6[4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm14[2,3],ymm10[4,5],ymm14[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 624(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm13[2,3],ymm7[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm14[2,3],ymm15[4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm15, %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] @@ -2143,145 +2137,144 @@ ; AVX2-ONLY-NEXT: vpblendd $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1],ymm11[2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm1 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm13 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm8[2,3],ymm13[4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <2,7,4,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm12[4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm13, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,7,4,u> +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm14[4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, 
%ymm12, %ymm1 ; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7] -; AVX2-ONLY-NEXT: vmovdqa %ymm5, %ymm7 -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [21474836480,21474836480,21474836480,21474836480] -; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm9[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm13, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm9 -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm8 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm2, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm9 +; AVX2-ONLY-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm5[4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm13, %ymm1 -; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vmovdqa %ymm14, %ymm2 -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm12, %ymm1 +; AVX2-ONLY-NEXT: vinserti128 $1, 576(%rdi), %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm10, %ymm5 ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm4[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 608(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm13, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; 
AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm12, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm3 +; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3,4,5,6],ymm10[7] +; AVX2-ONLY-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2],ymm10[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm11[4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm3, %ymm4 ; AVX2-ONLY-NEXT: vinserti128 $1, 416(%rdi), %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm8[7] +; AVX2-ONLY-NEXT: vmovdqa %ymm13, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5,6],ymm8[7] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2],ymm8[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqa 448(%rdi), %ymm11 +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm2, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm2[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm8 = mem[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],mem[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm8 = ymm13[12,13,14,15],ymm14[0,1,2,3,4,5,6,7,8,9,10,11],ymm13[28,29,30,31],ymm14[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [25769803777,25769803777,25769803777,25769803777] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm8, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm6[0,1,2,3],ymm12[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm6[12,13,14,15],ymm4[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm4[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm8, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm15[12,13,14,15],ymm5[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm5[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm9[4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm5[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm5[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm8, %ymm10 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm8, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,0,2,2,7,4,6,6] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm3[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm15[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm1 # 32-byte Reload -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm8, %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm10 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,0,2,2,7,4,6,6] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm7[0,1],ymm13[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5],ymm7[6,7] +; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm10 = ymm12[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,3,2,3] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7] +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm8, %ymm8 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm8[6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,2,3],ymm13[4,5],mem[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm7[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,1,6,u> -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm8, %ymm0 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [30064771074,30064771074,30064771074,30064771074] -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm12[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm12[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm8, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm10 +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm9[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm8, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm2 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm3[0,1,2,3,4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3],ymm14[4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm5, %ymm9 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm5[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm5[5],ymm9[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, (%rsp), %ymm4, %ymm9 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm4[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm4[5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm8, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm9[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm3 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm4[0,1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpblendd $48, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload -; AVX2-ONLY-NEXT: # ymm5 = ymm4[0,1,2,3],mem[4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm0 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vperm2i128 $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm14[0,1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm14[5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd $207, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload +; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,2,3],ymm15[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload @@ -2319,10 +2312,10 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovaps %ymm4, 32(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm3, (%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm1, 64(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm2, 96(%r9) -; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r9) -; AVX2-ONLY-NEXT: addq $1032, %rsp # imm = 0x408 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, 64(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm1, 96(%r9) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%r9) +; AVX2-ONLY-NEXT: addq $1000, %rsp # imm = 0x3E8 ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; @@ -4428,8 +4421,7 @@ ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = ymm10[0,1],mem[2,3],ymm10[4,5],mem[6,7] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [5,2,7,0,5,2,7,0] -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm1, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm3 @@ -4520,7 +4512,7 @@ ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [21474836480,21474836480,21474836480,21474836480] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4623,7 +4615,7 @@ ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm13 = ymm12[12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10,11],ymm12[28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [25769803777,25769803777,25769803777,25769803777] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-ONLY-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4712,7 +4704,7 @@ ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,1,6,u> ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm7, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [30064771074,30064771074,30064771074,30064771074] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm6, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -631,14 +631,14 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm10, %ymm6 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] @@ -677,8 +677,7 @@ ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] @@ -686,8 +685,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm3 ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rsi) @@ -716,14 +714,14 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm6 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] @@ -762,8 +760,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] @@ -771,8 +768,7 @@ ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rsi) @@ -801,14 +797,14 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm10, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] @@ -847,8 +843,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] @@ -856,8 +851,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rsi) @@ -1415,69 +1409,69 @@ ; AVX2-SLOW-NEXT: subq $200, %rsp ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <0,6,4,u> +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm5[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = 
ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm1, %ymm14 -; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm7, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm10[4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[0,1],ymm8[0,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = <1,7,5,u> +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm7, %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] @@ -1486,27 +1480,28 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovaps %ymm13, %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -1514,7 +1509,7 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1525,42 +1520,40 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm7 = <4,2,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm7, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm10, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm11[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm7, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm10, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3],ymm5[4,5,6,7] ; 
AVX2-SLOW-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -1578,7 +1571,7 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-SLOW-NEXT: vmovaps %ymm13, (%r8) ; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) +; AVX2-SLOW-NEXT: vmovaps %ymm7, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rax) @@ -1591,97 +1584,98 @@ ; AVX2-FAST-NEXT: subq $200, %rsp ; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm11 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm6 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <0,6,4,u> +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm5[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm7, %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm1 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm10[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[0,1],ymm8[0,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893] -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = <1,7,5,u> +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpermps %ymm13, %ymm7, %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm7, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovaps %ymm13, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -1689,7 +1683,7 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1700,42 +1694,40 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm7 = <4,2,u,u> +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm11[2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm10, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte 
Reload @@ -1753,7 +1745,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-FAST-NEXT: vmovaps %ymm13, (%r8) ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FAST-NEXT: vmovaps %ymm7, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) @@ -1766,69 +1758,69 @@ ; AVX2-FAST-PERLANE-NEXT: subq $200, %rsp ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <0,6,4,u> -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm5[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm5[0,1,2,3],ymm10[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm1, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm7, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm1 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm10[4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm4[0,1],ymm8[0,1] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm2 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm2, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm7[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm7, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] ; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] @@ -1837,27 +1829,28 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -1865,7 +1858,7 @@ ; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -1876,42 +1869,40 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm10[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm7 = <4,2,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm7, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm10, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm11[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm7, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm10, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -1929,7 +1920,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r8) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rax) @@ -3185,8 +3176,8 @@ ; AVX2-SLOW-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovaps 448(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 416(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm4 @@ -3204,51 +3195,51 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %ymm0 ; 
AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm14, %ymm5 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm14, %ymm15 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 608(%rdi), %ymm0 @@ -3258,42 +3249,42 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm14, %ymm14 ; AVX2-SLOW-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-SLOW-NEXT: 
vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; 
AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm1 @@ -3301,8 +3292,8 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -3311,8 +3302,8 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] @@ -3321,8 +3312,8 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -3345,14 +3336,14 @@ ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: 
vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] @@ -3361,68 +3352,67 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm13 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3432,57 +3422,54 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 
32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm8 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm2, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] @@ -3490,56 +3477,55 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1],ymm15[2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm2, %ymm11 +; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm7, %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm11, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm14, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; 
AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%rsi) @@ -3575,14 +3561,15 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r8) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm1, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm2, 64(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: addq $1160, %rsp # imm = 0x488 @@ -3594,9 +3581,9 @@ ; AVX2-FAST-NEXT: subq $1160, %rsp # imm = 0x488 ; AVX2-FAST-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovaps 448(%rdi), %ymm7 -; AVX2-FAST-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 416(%rdi), %ymm9 +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm4 @@ -3614,51 +3601,51 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vshufps {{.*#+}} 
ymm7 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm14, %ymm15 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm11, %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 608(%rdi), %ymm0 @@ -3668,42 +3655,42 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm14, %ymm14 ; AVX2-FAST-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = 
ymm14[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovaps 736(%rdi), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm11, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm11, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermps %ymm13, 
%ymm5, %ymm0 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm11, %ymm1 @@ -3711,14 +3698,14 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [2,0,6,4,2,0,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [2,0,6,4,2,0,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -3731,13 +3718,13 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $51, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload @@ -3754,40 +3741,40 @@ ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; 
AVX2-FAST-NEXT: # ymm5 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 -; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm13 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] @@ -3801,35 +3788,34 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm13[3,3,3,3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm13 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3839,15 +3825,15 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: 
vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -3858,7 +3844,7 @@ ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] @@ -3866,25 +3852,24 @@ ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm8 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm1, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm2, 
%ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3895,56 +3880,55 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1],ymm15[2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm15, %ymm2, %ymm11 +; AVX2-FAST-NEXT: vmovaps %ymm2, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm7, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm7, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = +; 
AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm11, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm11, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) @@ -3982,12 +3966,13 @@ ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-FAST-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm2, 64(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: addq $1160, %rsp # imm = 0x488 @@ -4000,8 +3985,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 480(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 448(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-FAST-PERLANE-NEXT: vmovaps 416(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm4 @@ -4019,51 +4004,51 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm7[0,1],ymm6[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm3[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm14, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 608(%rdi), %ymm0 @@ -4073,42 +4058,42 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm14, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 672(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 640(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm0[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm1 @@ -4116,8 +4101,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -4126,8 +4111,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] @@ -4136,8 +4121,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload @@ -4160,14 +4145,14 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm15[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] @@ -4176,68 +4161,67 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1],ymm14[2,3],ymm13[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4],ymm13[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -4247,57 +4231,54 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm13[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm6[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm2, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm2, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm6[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm2, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] @@ -4305,56 +4286,55 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm15[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0,1],ymm15[2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm2, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # 
ymm8 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm11 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm7, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm7 -; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm3, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm11, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm14, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%rsi) @@ -4390,14 +4370,15 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r8) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $1160, %rsp # imm = 0x488 @@ -6980,7 +6961,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [8589934596,8589934596,8589934596,8589934596] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7147,7 +7128,7 @@ ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte 
Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [12884901893,12884901893,12884901893,12884901893] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7526,8 +7507,7 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7658,70 +7638,69 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u> -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <5,3,u,u> +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm1, %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 
32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm1, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm1, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm1, %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 192(%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7849,7 +7828,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8016,7 +7995,7 @@ ; AVX2-FAST-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [12884901893,12884901893,12884901893,12884901893] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8335,17 +8314,17 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -8382,32 +8361,31 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm9 = <4,2,u,u> +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8422,13 +8400,13 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm2[0,1],mem[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload @@ -8440,13 +8418,13 
@@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8461,12 +8439,12 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload @@ -8478,12 +8456,12 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8496,95 +8474,94 @@ ; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,1],ymm2[2,3],mem[4,5,6,7] -; 
AVX2-FAST-NEXT: vpermps %ymm6, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm9, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u> -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <5,3,u,u> +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = 
ymm7[0,1,2,3,4],ymm15[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vpermilps $85, (%rsp), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm1, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpermps %ymm10, %ymm1, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded 
Reload +; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpermps %ymm6, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 192(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8673,7 +8650,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm1, (%rax) @@ -8711,7 +8688,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [8589934596,8589934596,8589934596,8589934596] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8878,7 +8855,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [12884901893,12884901893,12884901893,12884901893] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9257,8 +9234,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,2,0,6,0,2,0,6] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm0 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ 
-9389,70 +9365,69 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[1,1,1,1,5,5,5,5] -; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <5,3,u,u> -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,3,1,7,0,3,1,7] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <5,3,u,u> +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm1, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = 
mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0,1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0,1,2],mem[3],ymm15[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm1, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm1, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm1, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm0[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 192(%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -910,12 +910,12 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm12, %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm12, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] @@ -928,7 +928,8 @@ ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm12, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpbroadcastd 136(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -979,7 +980,7 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1],ymm9[2,3],ymm10[4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [25769803781,25769803781,25769803781,25769803781] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <1,0,7,u,u,u,u,u> ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] @@ -1013,12 +1014,12 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = 
ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 @@ -1031,7 +1032,8 @@ ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm12, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -1116,12 +1118,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm12 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm12, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm12, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] @@ -1134,7 +1136,8 @@ ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm12, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 136(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm11[0],xmm3[1],xmm11[2,3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -1797,342 +1800,338 @@ ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX2-SLOW-NEXT: subq $232, %rsp +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-SLOW-NEXT: 
vmovdqa (%rdi), %ymm9 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,6,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,7,6,u> +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7] +; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpbroadcastd 196(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7] -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpbroadcastd 420(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm2[6],ymm10[7] +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq 304(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), 
%xmm4 +; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 420(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,0] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpblendd 
{{.*#+}} ymm4 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,3,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = ymm15[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm14[1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm11 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm13[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} 
ymm11 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm12[1],ymm15[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,2],ymm6[1,3],ymm7[4,6],ymm6[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,3,u,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm12 ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm11 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm10, %ymm11 ; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 212(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vpbroadcastd 324(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3] -; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-SLOW-NEXT: vpbroadcastd 324(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm10, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 436(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vpbroadcastd 216(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm11[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-SLOW-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = 
xmm4[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm13[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vpshufd $225, (%rsp), %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vpbroadcastd 440(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 136(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 80(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 136(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpermd 192(%rdi), %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 80(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 360(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermd 416(%rdi), %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 304(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm10, 32(%r9) -; AVX2-SLOW-NEXT: vmovdqa %ymm15, (%r9) +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 304(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm15, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r8) +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm11, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-SLOW-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-SLOW-NEXT: addq $232, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride7_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: subq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm1 -; 
AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm12 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm4 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm7 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm13 +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,7,6,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,7,6,u> +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm7[6],ymm5[7] +; AVX2-FAST-NEXT: vpermd %ymm8, %ymm6, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm5[1] +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX2-FAST-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm8 = xmm8[1],xmm11[1] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpbroadcastd 196(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm0[6],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm7 = xmm7[1],xmm0[1] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm0[1] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vpbroadcastd 420(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm7[5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = 
ymm6[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm7 = ymm9[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [25769803781,25769803781,25769803781,25769803781] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,2,0] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1],ymm13[2,3],ymm1[4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm8 = [25769803781,25769803781,25769803781,25769803781] +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [1,0,7,7,5,4,7,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3,4],ymm5[5],ymm7[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm6, %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm15 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm8 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,2,0] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1],ymm8[2,3],ymm4[4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2,3,4],ymm5[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm14[12,13,14,15],ymm15[0,1,2,3,4,5,6,7,8,9,10,11],ymm14[28,29,30,31],ymm15[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), 
%ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1],ymm11[2,3],ymm3[4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] -; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm4 = ymm1[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm8[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm10[0],ymm9[0],ymm10[2],ymm9[2] +; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm4 = ymm3[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm4 = ymm15[0],ymm14[0],ymm15[2],ymm14[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = 
ymm4[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm11[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm13[1],ymm1[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm9[1,3],ymm10[4,6],ymm9[5,7] -; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0],ymm8[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm11[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -2140,50 +2139,49 @@ ; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,3,u,u> -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm11 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm4 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] 
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm8 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm3 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0],ymm10[1],ymm9[2,3,4],ymm10[5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,0,3,3,1,0,7,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, (%rsp), %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1,2],xmm12[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 @@ -2197,280 +2195,278 @@ ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, 
%ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm4, %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm11[1],ymm5[2,3,4],ymm11[5],ymm5[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm5 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm4, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4],ymm8[5],ymm5[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 360(%rdi), %xmm4 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2,3,4],ymm6[5],ymm4[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm11, 
32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm1, (%r9) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm4, (%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) +; AVX2-FAST-NEXT: vmovdqa %ymm13, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) -; AVX2-FAST-NEXT: addq $296, %rsp # imm = 0x128 +; AVX2-FAST-NEXT: addq $264, %rsp # imm = 0x108 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf16: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $264, %rsp # imm = 0x108 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: subq $232, %rsp +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm1, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm11[1] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 196(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm0[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 420(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm2[6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[12,13,14,15],ymm6[0,1,2,3,4,5,6,7,8,9,10,11],ymm8[28,29,30,31],ymm6[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] -; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm15[2,3],ymm5[4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 420(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm6[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm5[2,3],ymm3[4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm10[1],ymm2[2,3,4],ymm10[5],ymm2[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm4 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm10[2,3],ymm12[4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,2,0] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,2,2,3,5,6,6,7] 
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm4[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0],ymm9[1],ymm7[2,3,4],ymm9[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm12[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm11 = ymm15[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm14[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm14[1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm11 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm13[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm6[0],ymm8[0],ymm6[2],ymm8[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 
= ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm13[1],xmm11[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm11 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm10[1],ymm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm12[1],ymm15[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm15[1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,2],ymm2[1,3],ymm1[4,6],ymm2[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm13[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,2],ymm8[1,3],ymm6[4,6],ymm8[5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,2],ymm6[1,3],ymm7[4,6],ymm6[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3,4],ymm3[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <4,3,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = 
ymm7[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm0 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 212(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm15 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm14[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 324(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 436(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 324(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm10, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 436(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 216(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm3[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1,2],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpshufd $225, (%rsp), %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm7[1],ymm6[2,3,4],ymm7[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 440(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 136(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm9[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm12[1],ymm6[2,3,4],ymm12[5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 136(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = 
+; AVX2-FAST-PERLANE-NEXT: vpermd 192(%rdi), %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 80(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 360(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermd 416(%rdi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpermd 416(%rdi), %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 304(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, (%r9) +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 304(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4],ymm14[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $264, %rsp # imm = 0x108 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $232, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4015,18 +4011,18 @@ ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm11[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -4046,7 +4042,8 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm8 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4065,7 +4062,7 @@ ; AVX2-SLOW-NEXT: vpbroadcastq 
304(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -4106,10 +4103,10 @@ ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] @@ -4129,7 +4126,7 @@ ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -4167,19 +4164,18 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm0[2,3],ymm10[4,5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] @@ -4187,21 +4183,21 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm14 ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm1 @@ -4229,8 +4225,8 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 304(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 232(%rdi), %xmm13 @@ -4250,19 +4246,19 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm12[1,3],ymm10[4,6],ymm12[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm15[1,3],ymm10[4,6],ymm15[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 
208(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm0 = ymm11[0],mem[1],ymm11[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0],ymm11[1],mem[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm8[1,3],ymm7[4,6],ymm8[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] @@ -4273,13 +4269,13 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm10 -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] @@ -4294,7 +4290,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] @@ -4304,11 +4300,11 @@ ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm4, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm12, %ymm5 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm4 ; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -4321,23 +4317,23 @@ ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm12 +; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm10 ; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] @@ -4350,18 +4346,18 @@ ; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm10 -; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm6, %ymm6 +; AVX2-SLOW-NEXT: vmovaps %ymm11, %ymm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = 
xmm14[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] @@ -4369,15 +4365,16 @@ ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6 @@ -4390,8 +4387,7 @@ ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6 @@ -4404,8 +4400,7 @@ ; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm8 @@ -4414,103 +4409,103 @@ ; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm8 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm4, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm14[3] ; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; 
AVX2-SLOW-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 96(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm4, (%r9) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm5, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r8) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 
32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm3, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) @@ -4518,7 +4513,7 @@ ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rax) ; AVX2-SLOW-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-SLOW-NEXT: addq $1224, %rsp # imm = 0x4C8 @@ -4527,25 +4522,23 @@ ; ; AVX2-FAST-LABEL: load_i32_stride7_vf32: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $1224, %rsp # imm = 0x4C8 -; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: subq $1240, %rsp # imm = 0x4D8 +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %ymm8 +; AVX2-FAST-NEXT: vmovdqa 448(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -4559,11 +4552,12 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm9[6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm8[6],ymm9[7] ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -4581,7 +4575,7 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 304(%rdi), %ymm2 -; AVX2-FAST-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %xmm2 @@ -4594,9 +4588,9 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm2[6],ymm8[7] -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 672(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm2[6],ymm11[7] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 @@ -4623,12 +4617,15 @@ ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm6[2,3],ymm10[4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm10 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm10[2,3],ymm6[4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [25769803781,25769803781,25769803781,25769803781] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3,4],ymm12[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4643,14 +4640,12 @@ ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm4[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm4[2,3],ymm12[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = 
ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] @@ -4678,90 +4673,91 @@ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 832(%rdi), %ymm9 -; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[12,13,14,15],ymm7[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm7[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 800(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[12,13,14,15],ymm12[0,1,2,3,4,5,6,7,8,9,10,11],ymm9[28,29,30,31],ymm12[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1],ymm4[2,3],ymm14[4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm8[0,1],ymm3[2,3],ymm8[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3,4],ymm8[5],ymm13[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0],ymm11[1],ymm13[2,3,4],ymm11[5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm3[0],ymm8[0],ymm3[2],ymm8[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm3[0],ymm10[0],ymm3[2],ymm10[2] ; AVX2-FAST-NEXT: 
vpbroadcastd 204(%rdi), %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm2 = ymm12[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm10[0],ymm5[0],ymm10[2],ymm5[2] -; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm11 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm11[5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm5[0],ymm11[0],ymm5[2],ymm11[2] +; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 752(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm11 = ymm14[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2],ymm2[3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm8[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm2[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 680(%rdi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm2[1],xmm15[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm7[0],ymm9[0],ymm7[2],ymm9[2] -; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm14 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm11 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm9[0],ymm12[2],ymm9[2] +; AVX2-FAST-NEXT: vpbroadcastd 876(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 304(%rdi), %xmm13 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2],ymm11[3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 232(%rdi), %xmm14 -; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm4[1],xmm14[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm15[0],ymm12[0],ymm15[2],ymm12[2] -; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0],ymm6[1],mem[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm2[0],ymm15[0],ymm2[2],ymm15[2] +; AVX2-FAST-NEXT: vpbroadcastd 428(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm14 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,2],ymm8[1,3],ymm3[4,6],ymm8[5,7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 +; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm3[0,2],ymm10[1,3],ymm3[4,6],ymm10[5,7] +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] @@ -4773,35 +4769,35 @@ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = 
ymm10[0,2],ymm5[1,3],ymm10[4,6],ymm5[5,7] +; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm6 +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,2],ymm6[1,3],ymm5[4,6],ymm6[5,7] ; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0],ymm4[1],ymm9[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1,2],mem[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0],ymm9[1],ymm7[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,2],ymm12[1,3],ymm15[4,6],ymm12[5,7] -; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm10 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm15[1,3],ymm2[4,6],ymm15[5,7] ; AVX2-FAST-NEXT: vbroadcastss 432(%rdi), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $2, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm9[1,3],ymm8[4,6],ymm9[5,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,2],ymm9[1,3],ymm14[4,6],ymm9[5,7] ; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -4815,116 +4811,116 @@ ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm11 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 212(%rdi), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 548(%rdi), %xmm4 +; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpbroadcastd 548(%rdi), %xmm5 ; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm11, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm13 -; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm11 +; AVX2-FAST-NEXT: vpbroadcastd 660(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastd 772(%rdi), %xmm7 -; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm6 +; AVX2-FAST-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm5 = ymm1[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpbroadcastd 324(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %xmm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm11, %ymm7 -; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpermd 
%ymm12, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 436(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastd 772(%rdi), %xmm12 +; AVX2-FAST-NEXT: vmovdqa 736(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm7[0,1,2],xmm12[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3] +; AVX2-FAST-NEXT: vpermd %ymm14, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0],ymm12[1],ymm14[2,3,4],ymm12[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpbroadcastd 884(%rdi), %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastd 216(%rdi), %ymm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 544(%rdi), %xmm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm10[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] -; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 -; AVX2-FAST-NEXT: vpbroadcastd 440(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastd 664(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %xmm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,3,2] -; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm7 = xmm7[0],mem[1],xmm7[2,3] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] +; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm9 +; AVX2-FAST-NEXT: vpbroadcastd 
440(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 768(%rdi), %xmm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,3,2] +; AVX2-FAST-NEXT: vpshufd $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm7 = xmm11[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermd %ymm11, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpbroadcastd 888(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 584(%rdi), %xmm8 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm8 = xmm8[0],mem[1],xmm8[2,3] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX2-FAST-NEXT: vpermd 640(%rdi), %ymm11, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 528(%rdi), %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload @@ -4933,7 +4929,7 @@ ; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 136(%rdi), %xmm10 ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] @@ -4941,7 +4937,7 @@ ; AVX2-FAST-NEXT: vpermd 192(%rdi), %ymm11, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 80(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3] ; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload @@ -4956,53 +4952,52 @@ ; AVX2-FAST-NEXT: vpermd 416(%rdi), %ymm11, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 304(%rdi), %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), 
%xmm13 +; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = xmm13[0],mem[1],xmm13[2,3] +; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm11 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpbroadcastd 808(%rdi), %xmm12 -; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpermd 864(%rdi), %ymm11, %ymm11 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vpbroadcastd 752(%rdi), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm11, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rdx) +; AVX2-FAST-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 32(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm1, (%rcx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%r8) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -5014,21 +5009,21 @@ ; AVX2-FAST-NEXT: vmovdqa %ymm0, 96(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 32(%r9) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 64(%r9) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm6, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm3, 64(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm8, 96(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm9, 32(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm9, 96(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm12, 32(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm10, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm7, 64(%rax) -; AVX2-FAST-NEXT: addq $1224, %rsp # imm = 0x4C8 +; AVX2-FAST-NEXT: vmovdqa %ymm8, 64(%rax) +; AVX2-FAST-NEXT: addq $1240, %rsp # imm = 0x4D8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -5038,18 +5033,18 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm11[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm0, %ymm2 @@ -5069,7 +5064,8 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -5088,7 +5084,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 304(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %xmm3 @@ -5129,10 +5125,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] @@ -5152,7 +5148,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm1[2,3],ymm15[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -5190,19 +5186,18 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm5 +; AVX2-FAST-PERLANE-NEXT: 
vmovdqa 128(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm0[2,3],ymm10[4,5],ymm0[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1],ymm0[2,3],ymm7[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0],ymm13[1],ymm11[2,3,4],ymm13[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,3,2,3] @@ -5210,21 +5205,21 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm11[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm3[0,1,2],ymm1[3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm1 @@ -5252,8 +5247,8 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 304(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm13 = ymm5[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 232(%rdi), %xmm13 @@ -5273,19 +5268,19 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm12[1,3],ymm10[4,6],ymm12[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm15[1,3],ymm10[4,6],ymm15[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm11[0],mem[1],ymm11[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $253, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0],ymm11[1],mem[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,2],ymm8[1,3],ymm7[4,6],ymm8[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] @@ -5296,13 +5291,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] @@ -5317,7 +5312,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] @@ -5327,11 +5322,11 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm4, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm12, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -5344,23 +5339,23 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm6, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = 
xmm2[0,1,2],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] @@ -5373,18 +5368,18 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm6, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] @@ -5392,15 +5387,16 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = 
xmm5[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6 @@ -5413,8 +5409,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6 @@ -5427,8 +5422,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm8 @@ -5437,111 +5431,111 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm14[3] +; 
AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0],mem[1],ymm9[2,3,4],mem[5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm4, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1,2],xmm14[3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm10, %xmm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; 
AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, (%r9) -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm5, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm10 = xmm10[0],mem[1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm5, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: 
vmovaps %ymm5, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%r9) +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 32(%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 64(%rax) ; AVX2-FAST-PERLANE-NEXT: addq $1224, %rsp # imm = 0x4C8 @@ -8614,14 +8608,14 @@ ; ; AVX2-SLOW-LABEL: load_i32_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $2680, %rsp # imm = 0xA78 -; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: subq $2664, %rsp # imm = 0xA68 +; AVX2-SLOW-NEXT: vmovdqa 1216(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1120(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 768(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm8 @@ -8648,13 +8642,12 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm9 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 832(%rdi), %xmm3 @@ -8666,10 +8659,11 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -8680,11 +8674,11 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 1568(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6],ymm12[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8721,16 +8715,16 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm2[6],ymm12[7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 544(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -8741,16 +8735,18 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6],ymm1[7] -; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; 
AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 896(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 992(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 992(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vpbroadcastq 976(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa 1024(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa 1056(%rdi), %xmm3 @@ -8814,11 +8810,11 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8836,12 +8832,12 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8859,12 +8855,13 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 1632(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} 
ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8882,10 +8879,10 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8903,11 +8900,10 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -8920,13 +8916,13 @@ ; AVX2-SLOW-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -8941,18 +8937,17 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm13[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-SLOW-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8973,48 +8968,48 @@ ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] ; AVX2-SLOW-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 752(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 680(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX2-SLOW-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1152(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX2-SLOW-NEXT: vpbroadcastd 1324(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -9029,29 +9024,30 @@ ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 1352(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovdqa 1376(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] ; AVX2-SLOW-NEXT: vpbroadcastd 1548(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-SLOW-NEXT: vpbroadcastd 204(%rdi), %ymm14 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 528(%rdi), %xmm0 @@ -9059,72 +9055,70 @@ ; AVX2-SLOW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vpbroadcastd 456(%rdi), %xmm14 +; AVX2-SLOW-NEXT: vmovdqa 480(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = 
xmm14[0],xmm15[1],xmm14[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-SLOW-NEXT: vpbroadcastd 652(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 976(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd 904(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-SLOW-NEXT: vpalignr {{.*#+}} ymm13 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd 904(%rdi), %xmm14 ; AVX2-SLOW-NEXT: vmovdqa 928(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX2-SLOW-NEXT: vpbroadcastd 1100(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-SLOW-NEXT: vpbroadcastd 1100(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm12 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] +; 
AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm4[1,3],ymm10[4,6],ymm4[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm5[0,2],ymm9[1,3],ymm5[4,6],ymm9[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] -; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm4[1,3],ymm6[4,6],ymm4[5,7] +; AVX2-SLOW-NEXT: vbroadcastss 880(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 
16-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 1328(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] @@ -9151,39 +9145,38 @@ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm15[0,2],mem[1,3],ymm15[4,6],mem[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = ymm12[0,2],mem[1,3],ymm12[4,6],mem[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 1552(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm1[1,3],ymm7[4,6],ymm1[5,7] -; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm10 +; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm13 ; AVX2-SLOW-NEXT: vmovaps %ymm7, %ymm8 ; AVX2-SLOW-NEXT: vbroadcastss 1104(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = 
ymm12[0,2],ymm11[1,3],ymm12[4,6],ymm11[5,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] @@ -9193,211 +9186,200 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm9[1,3],ymm3[4,6],ymm9[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u> -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm9 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm2 -; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; 
AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm6, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 212(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vbroadcastss 324(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm6, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 436(%rdi), %ymm4 +; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vbroadcastss 548(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm9, %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsp) # 32-byte 
Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vbroadcastss 772(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vbroadcastss 996(%rdi), %xmm7 ; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm8 +; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1108(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-SLOW-NEXT: vbroadcastss 1220(%rdi), %xmm9 ; AVX2-SLOW-NEXT: vmovaps 1184(%rdi), %xmm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0,1,2],xmm9[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm6, %ymm9 +; AVX2-SLOW-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 1332(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm13 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vbroadcastss 1444(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovaps 1408(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] +; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm6, %ymm10 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps 
$15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm9 +; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1556(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm10 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm5, %ymm5 -; AVX2-SLOW-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vmovaps 1632(%rdi), %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1780(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm8 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm8 = xmm10[0,1,2],mem[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps 
$225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm6 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm5 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm8 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 
-; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm9 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm15 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] +; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] @@ -9408,267 +9390,282 @@ ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm13[3] +; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-SLOW-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[0],ymm14[1],mem[2,3,4],ymm14[5],mem[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm8 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = 
ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,3] -; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 136(%rdi), %xmm5 +; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-SLOW-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm13 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm5, %ymm12 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm10[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpermps 192(%rdi), %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] +; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm5, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = 
ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 360(%rdi), %xmm12 +; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpermps 416(%rdi), %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 304(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastss 584(%rdi), %xmm12 -; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps 640(%rdi), %ymm5, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 528(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 808(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps $2, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1,2],xmm8[3] +; AVX2-SLOW-NEXT: vpermps 864(%rdi), %ymm5, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 752(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3] ; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 1032(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps 1088(%rdi), %ymm5, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 976(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 1256(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = 
xmm15[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps 1312(%rdi), %ymm5, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1200(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 1480(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm9, %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX2-SLOW-NEXT: vpermps 1536(%rdi), %ymm5, %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-SLOW-NEXT: vbroadcastss 1424(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-SLOW-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 1704(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm9, %ymm9 +; AVX2-SLOW-NEXT: vpermps 1760(%rdi), %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: 
vbroadcastss 1648(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] -; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 128(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 224(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 96(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 224(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 160(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm8, 96(%r9) -; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 64(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, 32(%r9) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm9, (%r9) -; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm13, 224(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovaps %ymm7, (%rax) +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; 
AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 160(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 224(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 192(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 160(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 
32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%r9) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovaps %ymm4, 224(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm14, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm7, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-SLOW-NEXT: vmovaps %ymm6, (%rax) -; AVX2-SLOW-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rax) +; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-SLOW-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm4, 192(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm3, 160(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm15, 96(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-SLOW-NEXT: vmovaps %ymm14, (%rax) +; AVX2-SLOW-NEXT: addq $2664, %rsp # imm = 0xA68 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $2680, %rsp # imm = 0xA78 +; AVX2-FAST-NEXT: subq $2664, %rsp # imm = 0xA68 ; AVX2-FAST-NEXT: vmovdqa 1216(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1152(%rdi), %ymm11 @@ -9739,9 +9736,9 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 1664(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 1648(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %xmm3 @@ -9793,9 +9790,9 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa 896(%rdi), %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm2[6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm1 @@ -9820,7 +9817,7 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 1440(%rdi), %ymm2 -; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastq 1424(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -9845,13 +9842,13 @@ ; AVX2-FAST-NEXT: vmovdqa 288(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25769803781,25769803781,25769803781,25769803781] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,0,7,7,5,4,7,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm0[2,3],ymm8[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [25769803781,25769803781,25769803781,25769803781] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,7,5,4,7,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9868,10 +9865,10 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm12[2,3],ymm7[4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0],ymm14[1],ymm10[2,3,4],ymm14[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9888,41 +9885,40 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1],ymm0[2,3],ymm9[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} 
ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 1728(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqa 1696(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10,11],ymm3[28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 1632(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1],ymm0[2,3],ymm6[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm0[2,3],ymm10[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] @@ -9930,14 +9926,14 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1],ymm0[2,3],ymm11[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $221, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3,4],ymm0[5],mem[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9952,14 +9948,14 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 512(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1],ymm0[2,3],ymm13[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9976,34 +9972,33 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1],ymm0[2,3],mem[4,5],ymm0[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,2,2,2] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 1504(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %ymm10 -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[12,13,14,15],ymm10[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm10[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 1472(%rdi), %ymm11 +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm3 = 
ymm0[12,13,14,15],ymm11[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm11[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 1408(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1],ymm6[2,3],ymm4[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10055,16 +10050,16 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 1600(%rdi), %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm8[0],mem[0],ymm8[2],mem[2] ; AVX2-FAST-NEXT: vpbroadcastd 1772(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -10076,14 +10071,14 @@ ; AVX2-FAST-NEXT: vmovdqa 1376(%rdi), %xmm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[2],ymm15[2] +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm11[0],ymm15[0],ymm11[2],ymm15[2] ; AVX2-FAST-NEXT: vpbroadcastd 1548(%rdi), %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 @@ -10091,7 +10086,7 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FAST-NEXT: vpbroadcastd 204(%rdi), %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] @@ -10099,32 +10094,32 @@ ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 528(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 456(%rdi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 480(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm9[0],mem[0],ymm9[2],mem[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = ymm1[0],mem[0],ymm1[2],mem[2] ; AVX2-FAST-NEXT: vpbroadcastd 652(%rdi), %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 976(%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = 
ymm10[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 904(%rdi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa 928(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] ; AVX2-FAST-NEXT: vpbroadcastd 1100(%rdi), %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0,1,2,3,4],ymm13[5,6,7] @@ -10150,8 +10145,8 @@ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,2],ymm14[1,3],ymm12[4,6],ymm14[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[0,2],ymm13[1,3],ymm12[4,6],ymm13[5,7] ; AVX2-FAST-NEXT: vbroadcastss 880(%rdi), %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] @@ -10164,9 +10159,9 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,2],ymm12[1,3],ymm11[4,6],ymm12[5,7] ; AVX2-FAST-NEXT: vbroadcastss 1328(%rdi), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] @@ -10179,35 +10174,32 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0,2],mem[1,3],ymm4[4,6],mem[5,7] +; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 
32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm8[0,2],mem[1,3],ymm8[4,6],mem[5,7] ; AVX2-FAST-NEXT: vbroadcastss 1776(%rdi), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm12[0,2],mem[1,3],ymm12[4,6],mem[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] ; AVX2-FAST-NEXT: vbroadcastss 1552(%rdi), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0],ymm1[1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm1[1,3],ymm7[4,6],ymm1[5,7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm11 -; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm8 +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,2],ymm10[1,3],ymm9[4,6],ymm10[5,7] ; AVX2-FAST-NEXT: vbroadcastss 1104(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -10220,8 +10212,9 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm7[1,3],ymm9[4,6],ymm7[5,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,2],ymm14[1,3],ymm8[4,6],ymm14[5,7] ; AVX2-FAST-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -10235,7 +10228,7 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups 
(%rsp), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] ; AVX2-FAST-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] @@ -10250,8 +10243,8 @@ ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vbroadcastss 212(%rdi), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -10265,7 +10258,7 @@ ; AVX2-FAST-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm3 +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2,3,4,5],mem[6,7] ; AVX2-FAST-NEXT: vbroadcastss 436(%rdi), %ymm4 @@ -10280,103 +10273,103 @@ ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm10, %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcastss 660(%rdi), %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 772(%rdi), %xmm6 +; AVX2-FAST-NEXT: vbroadcastss 772(%rdi), %xmm7 ; AVX2-FAST-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm14[6,7] -; AVX2-FAST-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm6, %ymm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = 
ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vbroadcastss 884(%rdi), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vbroadcastss 996(%rdi), %xmm7 +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vbroadcastss 996(%rdi), %xmm8 ; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1108(%rdi), %ymm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm8 -; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1108(%rdi), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vbroadcastss 1220(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovaps 1184(%rdi), %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] +; AVX2-FAST-NEXT: 
vpermps %ymm11, %ymm6, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1332(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm8 = mem[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm8, %ymm5, %ymm9 +; AVX2-FAST-NEXT: vbroadcastss 1444(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovaps 1408(%rdi), %xmm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm6, %ymm10 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vbroadcastss 1556(%rdi), %ymm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermps %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm11 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermps %ymm13, %ymm10, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vbroadcastss 1668(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovaps 1632(%rdi), %xmm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpermps %ymm11, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vbroadcastss 1780(%rdi), %ymm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [1,0,3,3,1,0,7,7] -; AVX2-FAST-NEXT: vpermps %ymm5, %ymm14, %ymm5 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = [1,0,3,3,1,0,7,7] +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm13, %ymm5 ; AVX2-FAST-NEXT: vbroadcastss 216(%rdi), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm10 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] @@ -10387,57 +10380,42 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vbroadcastss 440(%rdi), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[1],ymm1[2,3,4],mem[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; 
AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm2 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm14, %ymm2 -; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vbroadcastss 664(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm3 -; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = mem[0],ymm14[1],mem[2,3,4],ymm14[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm13, %ymm3 +; AVX2-FAST-NEXT: vbroadcastss 888(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm2 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] @@ -10446,241 +10424,254 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vbroadcastss 1336(%rdi), %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm3 -; AVX2-FAST-NEXT: 
vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm13, %ymm4 +; AVX2-FAST-NEXT: vbroadcastss 1112(%rdi), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm7[3] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vpermps %ymm8, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3,4],mem[5],ymm7[6,7] +; AVX2-FAST-NEXT: vpermps %ymm7, %ymm13, %ymm7 +; AVX2-FAST-NEXT: vbroadcastss 1336(%rdi), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm14[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1,2],xmm8[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0],ymm15[1],ymm12[2,3,4],ymm15[5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermps %ymm14, %ymm13, %ymm14 ; AVX2-FAST-NEXT: vbroadcastss 1560(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm9[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] -; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[0],ymm13[1],mem[2,3,4],ymm13[5],mem[6,7] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm14 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm10, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm15[0,1],xmm9[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm14 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm10, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm14 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm10, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm12 = xmm15[0,1,2],xmm5[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm12 = xmm15[0,1],xmm12[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm10, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} 
xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpermps 1088(%rdi), %ymm10, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 1256(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm10, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 1480(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm10, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = 
ymm3[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 1704(%rdi), %xmm12 -; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm10, %ymm10 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] -; AVX2-FAST-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 192(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 128(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 64(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, (%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 224(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 160(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 96(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rsi) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 192(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 128(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 64(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, (%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 224(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 160(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 96(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rdx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 128(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 64(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, (%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 192(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; 
AVX2-FAST-NEXT: vmovaps %ymm10, 224(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 160(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 96(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rcx) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, (%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 64(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 128(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 192(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 224(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 160(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 96(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%r8) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 224(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 192(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 160(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 64(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%r9) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm10, (%r9) +; AVX2-FAST-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm15 = mem[0],ymm11[1],mem[2,3,4],ymm11[5],mem[6,7] +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm13, %ymm13 +; AVX2-FAST-NEXT: vbroadcastss 1784(%rdi), %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 136(%rdi), %xmm13 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm13 = xmm13[0],mem[1],xmm13[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = +; AVX2-FAST-NEXT: vpermps 192(%rdi), %ymm15, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # 
ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 360(%rdi), %xmm11 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpermps 416(%rdi), %ymm15, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vbroadcastss 304(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1,2],xmm6[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 584(%rdi), %xmm11 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm11 = xmm11[0],mem[1],xmm11[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vpermps 640(%rdi), %ymm15, %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-NEXT: vbroadcastss 528(%rdi), %ymm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm11[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 808(%rdi), %xmm0 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermps 864(%rdi), %ymm15, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 752(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1032(%rdi), %xmm1 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpermps 
1088(%rdi), %ymm15, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 976(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1256(%rdi), %xmm2 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpermps 1312(%rdi), %ymm15, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1200(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1],xmm3[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1480(%rdi), %xmm3 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[1],xmm3[2,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-NEXT: vpermps 1536(%rdi), %ymm15, %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1424(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm11 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-NEXT: vextractf128 $1, %ymm11, %xmm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm4[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 1704(%rdi), %xmm4 +; AVX2-FAST-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] +; AVX2-FAST-NEXT: vpermps 1760(%rdi), %ymm15, %ymm11 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-NEXT: vbroadcastss 1648(%rdi), %ymm11 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1,2],xmm14[3] +; AVX2-FAST-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-NEXT: 
vextractf128 $1, %ymm14, %xmm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm14[0,1],xmm11[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 192(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 128(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, (%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 160(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%rsi) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 192(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 128(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, (%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 160(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%rdx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 128(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, (%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 192(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 160(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%rcx) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, (%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 128(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 
192(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 160(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 96(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%r8) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 224(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 192(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 160(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 128(%r9) +; AVX2-FAST-NEXT: vmovups (%rsp), %ymm6 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 64(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, 32(%r9) +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vmovaps %ymm11, (%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm11, 224(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm9, 224(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm8, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 96(%rax) @@ -10688,7 +10679,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm5, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, 32(%rax) -; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm5, (%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm4, 224(%rax) @@ -10696,23 +10687,23 @@ ; AVX2-FAST-NEXT: vmovaps %ymm2, 160(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm1, 128(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm14, 64(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm13, 32(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, (%rax) -; AVX2-FAST-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX2-FAST-NEXT: vmovaps %ymm13, 64(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm12, (%rax) +; AVX2-FAST-NEXT: addq $2664, %rsp # imm = 0xA68 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i32_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $2680, %rsp # imm = 0xA78 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: subq $2664, %rsp # imm = 0xA68 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1216(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1120(%rdi), %ymm5 -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 768(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 672(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm8 @@ -10739,13 +10730,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 752(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 800(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 832(%rdi), %xmm3 @@ -10757,10 +10747,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm4[6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 1200(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1248(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1280(%rdi), %xmm3 @@ -10771,11 +10762,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 1568(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 1568(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1664(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10812,16 +10803,16 @@ ; 
AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm2[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 448(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm2[6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 544(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 528(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 576(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 608(%rdi), %xmm3 @@ -10832,16 +10823,18 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 896(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 992(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 992(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 976(%rdi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1024(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1056(%rdi), %xmm3 @@ -10905,11 +10898,11 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 736(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1],ymm1[2,3],ymm10[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm9[1],ymm8[2,3,4],ymm9[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10927,12 +10920,12 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1184(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm5[1],mem[2,3,4],ymm5[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10950,12 +10943,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1632(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0],ymm12[1],mem[2,3,4],ymm12[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10973,10 +10967,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3],ymm6[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3],ymm5[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -10994,11 +10988,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 960(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5],ymm1[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3],ymm9[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpblendd $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0],ymm14[1],ymm13[2,3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,3,2,3] @@ -11011,13 +11004,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1472(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm2[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1408(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1],ymm7[2,3],ymm15[4,5],ymm7[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm7[2,3],ymm14[4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -11032,18 +11025,17 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm0 = mem[2,2,2,2] ; 
AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm13[12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10,11],ymm13[28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26,27] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm15[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm6[2,3],ymm8[4,5],ymm6[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm12[2,3],ymm8[4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11064,48 +11056,48 @@ ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm5[0],ymm9[0],ymm5[2],ymm9[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 428(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 752(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 680(%rdi), 
%xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 704(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm3[0],mem[0],ymm3[2],mem[2] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 876(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1200(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm11[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1128(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1152(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[2],mem[2] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1324(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1648(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm5[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1576(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1600(%rdi), %xmm2 @@ -11120,29 +11112,30 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vmovdqa 1424(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm14[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1352(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 1376(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm13[0],ymm1[2],ymm13[2] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1548(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 80(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm14[0],ymm13[0],ymm14[2],ymm13[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[2],ymm15[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 204(%rdi), %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 528(%rdi), %xmm0 @@ -11150,72 +11143,70 @@ ; AVX2-FAST-PERLANE-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 456(%rdi), %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 480(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm10[0],ymm11[0],ymm10[2],ymm11[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 652(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 976(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm14 = ymm9[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2],ymm0[3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 904(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpalignr {{.*#+}} ymm13 = ymm8[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm0[3],ymm13[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 904(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 928(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm0[1],xmm15[2,3] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm15 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1100(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm13 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm13 = 
ymm5[0],mem[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpunpcklqdq {{.*#+}} ymm14 = ymm7[0],ymm1[0],ymm7[2],ymm1[2] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 1100(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm12 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm6[0],mem[1],ymm6[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm4[1,3],ymm10[4,6],ymm4[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm13[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm12[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm5[0,2],ymm9[1,3],ymm5[4,6],ymm9[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm13[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups 
%ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2],ymm4[1,3],ymm6[4,6],ymm4[5,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 880(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm13[0,2],ymm14[1,3],ymm13[4,6],ymm14[5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,2],ymm3[1,3],ymm14[4,6],ymm3[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1328(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm5[5,6,7] @@ -11242,39 +11233,38 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm15[0,2],mem[1,3],ymm15[4,6],mem[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm12[0,2],mem[1,3],ymm12[4,6],mem[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1552(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0],ymm8[1],ymm9[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm8[0],mem[1],ymm8[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3] ; AVX2-FAST-PERLANE-NEXT: vpshufd 
{{.*#+}} xmm0 = xmm0[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,2],ymm1[1,3],ymm7[4,6],ymm1[5,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1104(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm11[1,3],ymm12[4,6],ymm11[5,7] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2],ymm11[1,3],ymm10[4,6],ymm11[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 656(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] @@ -11284,211 +11274,200 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm4[1,3],ymm3[4,6],ymm4[5,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,2],ymm9[1,3],ymm3[4,6],ymm9[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 208(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,3,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm9 = [30064771072,30064771072,30064771072,30064771072] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm6, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 212(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 324(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 288(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm6, %ymm2 +; 
AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm6, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 436(%rdi), %ymm4 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 548(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm9, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 660(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 772(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 736(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 
= xmm4[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3,4,5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2,3,4,5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 884(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 996(%rdi), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1108(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1220(%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 1184(%rdi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm7[0,1,2],xmm9[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm6, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1332(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm13 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm13[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1444(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1408(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm6, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, 
%ymm9 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm10[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1556(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm10 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0,1,2,3],mem[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm5, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1668(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps 1632(%rdi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1780(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3,4],mem[5],ymm6[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm10 +; 
AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm8 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm8 = xmm10[0,1,2],mem[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm6 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm8 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0],ymm15[1],mem[2,3,4],ymm15[5],mem[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3,4],mem[5],ymm8[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm9 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm15 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3,4],ymm3[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] @@ -11499,261 +11478,276 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $221, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0],ymm14[1],mem[2,3,4],ymm14[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm4 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1,2],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 
80(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm7[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 136(%rdi), %xmm5 +; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[0],mem[1],xmm5[2,3] +; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vpermps 192(%rdi), %ymm5, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm14 = ymm14[0],mem[1],ymm14[2,3,4],mem[5],ymm14[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 360(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps 416(%rdi), %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 304(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1,2],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte 
Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm13[0,1],xmm6[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 584(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1],xmm5[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps 640(%rdi), %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 528(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 808(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1,2],xmm8[3] +; AVX2-FAST-PERLANE-NEXT: vpermps 864(%rdi), %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 752(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
xmm14 = xmm15[0,1],xmm14[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1032(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps 1088(%rdi), %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 976(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1256(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps 1312(%rdi), %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1200(%rdi), 
%ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1],xmm3[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1480(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm9, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermps 1536(%rdi), %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1424(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0],mem[1],ymm13[2,3,4],mem[5],ymm13[6,7] +; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm13, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm13[0,1],xmm4[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1704(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm9, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpermps 1760(%rdi), %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1648(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[2,3,2,3,6,7,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = ymm12[0],mem[1],ymm12[2,3,4],mem[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1],xmm4[2,3] -; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rsi) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 128(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 192(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 224(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 96(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%r8) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 224(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 160(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 128(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 96(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 64(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 32(%r9) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, (%r9) -; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm13, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm11, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 96(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, (%rax) +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm12[0,1],xmm0[2,3] +; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rsi) +; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 160(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rsi) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 160(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 160(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 160(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r8) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 224(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 192(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 160(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%r9) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 224(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 192(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 160(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 128(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm7, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 64(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, (%rax) -; AVX2-FAST-PERLANE-NEXT: addq $2680, %rsp # imm = 0xA78 +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rax) +; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, 224(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, 192(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, 160(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, 128(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, 96(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, 64(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rax) +; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm14, (%rax) +; AVX2-FAST-PERLANE-NEXT: addq $2664, %rsp # imm = 0xA68 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -174,21 +174,37 @@ ; SSE-NEXT: movdqa %xmm0, (%rdx) ; SSE-NEXT: retq ; -; AVX1-LABEL: load_i8_stride2_vf16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vmovdqa %xmm1, (%rdx) -; AVX1-NEXT: retq +; AVX1-ONLY-LABEL: load_i8_stride2_vf16: +; AVX1-ONLY: # %bb.0: +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3 +; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX1-ONLY-NEXT: vmovdqa %xmm1, (%rdx) +; AVX1-ONLY-NEXT: retq +; +; AVX2-ONLY-LABEL: load_i8_stride2_vf16: +; AVX2-ONLY: # %bb.0: +; AVX2-ONLY-NEXT: vpbroadcastw {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX2-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3 +; AVX2-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %xmm1, (%rdx) +; AVX2-ONLY-NEXT: retq ; ; AVX512F-LABEL: load_i8_stride2_vf16: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll @@ -655,8 +655,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX1-ONLY-NEXT: # ymm5 = mem[0,1,0,1] +; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 @@ -1366,15 +1365,13 @@ ; AVX2-ONLY-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm2 ; AVX2-ONLY-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3 ; 
AVX2-ONLY-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm4 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; AVX2-ONLY-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm6, %ymm1, %ymm7 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm5 ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255] -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255> ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm9 @@ -1395,8 +1392,7 @@ ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] -; AVX2-ONLY-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -852,50 +852,51 @@ ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm1, %ymm9 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm6[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm1, %ymm9 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm6, %ymm8 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} 
xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-ONLY-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm0, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm7, %ymm10 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm10 ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm9 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm1, %ymm11 ; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm6, %ymm11 ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm6, %ymm10 +; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm7, %ymm10 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm11 +; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm10 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm11 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm1, %ymm12 +; AVX2-ONLY-NEXT: vpermd %ymm12, %ymm6, %ymm12 +; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm11 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpshufb 
%xmm11, %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3 @@ -906,12 +907,12 @@ ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm6, %ymm1 ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa %ymm7, (%rsi) -; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rcx) +; AVX2-ONLY-NEXT: vmovdqa %ymm8, (%rsi) +; AVX2-ONLY-NEXT: vmovdqa %ymm9, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm10, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, (%r8) ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq @@ -1736,179 +1737,195 @@ ; ; AVX2-ONLY-LABEL: load_i8_stride4_vf64: ; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: subq $216, %rsp +; AVX2-ONLY-NEXT: subq $248, %rsp ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm4 -; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm5 -; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 -; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm8 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm9 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm1, %ymm9 -; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [17179869184,17179869184,17179869184,17179869184] -; AVX2-ONLY-NEXT: vpermd %ymm9, %ymm2, %ymm9 -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm11 -; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm2, %ymm11 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm9[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm13 +; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %xmm7 +; AVX2-ONLY-NEXT: vmovdqa 48(%rdi), %xmm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, %xmm8 +; AVX2-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm4 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm4 +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm9 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0,1],xmm3[2,3] +; AVX2-ONLY-NEXT: 
vmovdqa {{.*#+}} ymm14 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm1, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm1, %ymm10 +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm0, %ymm11 +; AVX2-ONLY-NEXT: vmovdqa %ymm0, %ymm3 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm10[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm8 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm11 -; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm9 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] -; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm11 -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm14 -; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm12 -; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm12, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] +; AVX2-ONLY-NEXT: vmovdqa 176(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm11 +; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] +; AVX2-ONLY-NEXT: vmovdqa 144(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm15 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, %xmm11 +; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm12, %xmm0, %xmm12 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3] -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm14 -; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm2, %ymm15 -; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm0 +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm0, %ymm15 +; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm1, %ymm0 +; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm10 +; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm1 +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm1, %ymm14 +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm3, %ymm14 +; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm9 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm0, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm15[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm2 +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm7, %xmm14 +; AVX2-ONLY-NEXT: vmovdqa %xmm7, %xmm12 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = 
xmm14[0],xmm2[0],xmm14[1],xmm2[1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa %xmm6, %xmm15 +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm3 +; AVX2-ONLY-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm7, %ymm3 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm10, %ymm3 +; AVX2-ONLY-NEXT: vmovdqa %ymm4, %ymm10 +; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm9, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm13 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm15 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, %xmm7 -; AVX2-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa %xmm5, %xmm14 -; AVX2-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm5, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 +; AVX2-ONLY-NEXT: vmovdqa %xmm11, %xmm9 +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm6, %ymm4 -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm2, %ymm4 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm0 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm11, %xmm3 -; AVX2-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm4 -; AVX2-ONLY-NEXT: 
vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm12, %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm4, %ymm1 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm3, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm10, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm4 -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm13 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm13 -; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm2, %ymm13 -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm6, %ymm15 -; AVX2-ONLY-NEXT: vpermd %ymm15, %ymm2, %ymm15 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5],ymm13[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm1 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 +; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm3 +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-ONLY-NEXT: vpermd %ymm14, %ymm10, %ymm14 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5],ymm4[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] +; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm1 +; AVX2-ONLY-NEXT: vmovdqa %xmm5, %xmm14 +; AVX2-ONLY-NEXT: vpshufb 
%xmm0, %xmm8, %xmm0 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm1 +; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm2 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vmovdqa %ymm12, %ymm10 -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm1 -; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm12, %ymm3 -; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm3 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm6, %ymm1 +; AVX2-ONLY-NEXT: vmovdqa %ymm7, %ymm6 +; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm7, %ymm2 +; AVX2-ONLY-NEXT: vpermd %ymm2, %ymm10, %ymm2 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm15, %xmm4 ; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm6 -; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm2, %ymm6 -; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm2, %ymm7 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm9, %xmm0 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm6 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm14, %xmm4 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2,3] -; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm10, %ymm4 -; 
AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm12, %ymm5 -; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm2, %ymm4 -; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] -; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rsi) -; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, 32(%rdx) +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm5, %ymm5 +; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm6, %ymm5 +; AVX2-ONLY-NEXT: vmovdqa %ymm6, %ymm12 +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm10, %ymm6 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5 +; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm3 +; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] +; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm3, %ymm3 +; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm12, %ymm3 +; AVX2-ONLY-NEXT: vpshufb %ymm4, %ymm7, %ymm4 +; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm10, %ymm4 +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rsi) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, 32(%rdx) +; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-ONLY-NEXT: vmovaps %ymm3, (%rdx) +; AVX2-ONLY-NEXT: vmovdqa %ymm2, 32(%rcx) ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rdx) -; AVX2-ONLY-NEXT: vmovdqa %ymm3, 32(%rcx) -; AVX2-ONLY-NEXT: vmovdqa %ymm13, (%rcx) +; AVX2-ONLY-NEXT: vmovaps %ymm2, (%rcx) ; AVX2-ONLY-NEXT: vmovdqa %ymm0, 32(%r8) ; AVX2-ONLY-NEXT: vmovdqa %ymm1, (%r8) -; AVX2-ONLY-NEXT: addq $216, %rsp +; AVX2-ONLY-NEXT: addq $248, %rsp ; AVX2-ONLY-NEXT: vzeroupper ; AVX2-ONLY-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -1741,7 +1741,7 @@ ; AVX2-ONLY-NEXT: 
vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,5,10,15,20,25,30,19,24,29,18,23,28,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,1,6,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,21,26,31,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [21474836480,21474836480,21474836480,21474836480] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa %ymm6, (%rsi) @@ -3523,8 +3523,7 @@ ; AVX2-ONLY-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm6 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm12 ; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -3533,8 +3532,7 @@ ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm7 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm13 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] @@ -3545,8 +3543,7 @@ ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3559,8 +3556,7 @@ ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0] -; AVX2-ONLY-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm0 @@ -3655,8 +3651,7 @@ ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendvb %ymm5, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255] -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,u,u,u,u,255,u,u,u,u,255,u,u,u,u,255,u,u,255,0,255,u,u,255,0,255,u,u,255,0,255,u> ; AVX2-ONLY-NEXT: 
vpblendvb %ymm11, %ymm3, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm5[2,3,0,1] ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm5, %ymm3, %ymm0 @@ -3724,7 +3719,7 @@ ; AVX2-ONLY-NEXT: vmovdqa 288(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-ONLY-NEXT: vpshufb %ymm2, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm0 = [21474836480,21474836480,21474836480,21474836480] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = ; AVX2-ONLY-NEXT: vpermd %ymm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm6, %ymm10, %ymm6 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -501,8 +501,7 @@ ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm2[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1099511627775,1099511627775] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <255,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u> ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm1[6,13,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm0[1,8,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u,u,u] @@ -3069,38 +3068,39 @@ ; AVX2-SLOW-NEXT: subq $104, %rsp ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm14, %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm13 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm13 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = 
ymm7[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm2, %ymm7, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm8 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm14, %ymm9 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm14, %ymm9 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7,8,9],ymm10[10],ymm9[11,12,13],ymm10[14],ymm9[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -3115,47 +3115,46 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm7, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX2-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; 
AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm8 @@ -3168,9 +3167,10 @@ ; AVX2-SLOW-NEXT: vpor %xmm8, %xmm13, %xmm8 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm5, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -3182,64 +3182,63 @@ ; AVX2-SLOW-NEXT: vpor %xmm6, %xmm15, %xmm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm3, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm15 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm4, %ymm15 +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm4 -; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm14, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm14, %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm14, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm14, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8,9,10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] +; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm14, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm14, %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm7, %xmm0 +; 
AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7,8,9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm8 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm15 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm15 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4,5,6],ymm15[7,8],ymm1[9,10],ymm15[11],ymm1[12,13,14],ymm15[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4],ymm9[5,6],ymm6[7,8],ymm9[9,10,11],ymm6[12],ymm9[13,14],ymm6[15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4],ymm9[5,6],ymm4[7,8],ymm9[9,10,11],ymm4[12],ymm9[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = 
ymm4[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6,7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm5 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm5 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7,8],ymm5[9],ymm14[10,11,12],ymm5[13],ymm14[14,15] +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm5 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] @@ -3250,31 +3249,31 @@ ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, (%rsp), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; 
AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm5 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rsi) ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-SLOW-NEXT: addq $104, %rsp ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq @@ -3287,106 +3286,105 @@ ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm7 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm13 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vmovdqa %ymm8, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7,8,9],ymm8[10],ymm1[11,12],ymm8[13],ymm1[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm8[2],ymm2[3,4],ymm8[5],ymm2[6,7,8,9],ymm8[10],ymm2[11,12],ymm8[13],ymm2[14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: 
vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm8 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm8 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm8, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,5,12],zero,zero,xmm2[1,8,15],zero,zero,xmm2[u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm9[2],ymm1[3,4,5],ymm9[6],ymm1[7,8,9],ymm9[10],ymm1[11,12,13],ymm9[14],ymm1[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm2 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm2[0,1],ymm9[2],ymm2[3,4,5],ymm9[6],ymm2[7,8,9],ymm9[10],ymm2[11,12,13],ymm9[14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u],zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm2, %xmm9, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[2,9,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[2,9,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm15 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[5,12] ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX2-FAST-NEXT: vpor %xmm8, %xmm9, %xmm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = 
xmm8[u,u,2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[0,7,14],zero,zero,xmm8[3,10,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13] +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm7, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm2, %xmm8, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX2-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm6, %ymm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm8[u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm14 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm2[u,u,3,10],zero,zero,zero,xmm2[6,13],zero,zero,xmm2[u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[1,8,15],zero,zero,xmm2[4,11,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm2, %xmm10, %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14] ; AVX2-FAST-NEXT: vpor %xmm10, %xmm13, %xmm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm10, %ymm12 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm6, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm7 @@ -3398,45 +3396,45 @@ ; AVX2-FAST-NEXT: vpor %xmm0, %xmm7, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 
= -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm9 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm4, %ymm15 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm2, %ymm3, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5],ymm6[6],ymm5[7,8,9,10],ymm6[11],ymm5[12,13],ymm6[14],ymm5[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm7[3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm5 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm3, %ymm11 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[1,8,15],zero,zero,xmm7[4,11,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0],ymm11[1,2],ymm7[3],ymm11[4,5,6],ymm7[7,8],ymm11[9,10],ymm7[11],ymm11[12,13,14],ymm7[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm7 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,12,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm3, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3],ymm5[4,5],ymm7[6],ymm5[7,8,9,10],ymm7[11],ymm5[12,13],ymm7[14],ymm5[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, 
%ymm3 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[3,10],zero,zero,zero,xmm9[6,13],zero,zero,xmm9[u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[1,8,15],zero,zero,xmm9[4,11,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm5, %xmm9, %xmm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm11[1,2],ymm9[3],ymm11[4,5,6],ymm9[7,8],ymm11[9,10],ymm9[11],ymm11[12,13,14],ymm9[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm9, %ymm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm9 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm15[4,11],zero,zero,xmm15[0,7,14],zero,zero,xmm15[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm11 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm9[1,2,3],ymm11[4],ymm9[5,6],ymm11[7,8],ymm9[9,10,11],ymm11[12],ymm9[13,14],ymm11[15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm7 +; AVX2-FAST-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm11 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3],ymm11[4],ymm8[5,6],ymm11[7,8],ymm8[9,10,11],ymm11[12],ymm8[13,14],ymm11[15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u,u,u,u] @@ -3444,19 +3442,19 @@ ; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm13[0],ymm9[1],ymm13[2,3],ymm9[4],ymm13[5,6,7,8],ymm9[9],ymm13[10,11],ymm9[12],ymm13[13,14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm9, %ymm4 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm1[4,11],zero,zero,xmm1[0,7,14,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7,8],ymm9[9],ymm2[10,11,12],ymm9[13],ymm2[14,15] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm2 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[1,8,15,u,u,u,u] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm8[u,u,0,7,14],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[u,u,u,u] -; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm9, %ymm4 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; 
AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[4,11],zero,zero,xmm6[0,7,14,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm6, %xmm9, %xmm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm9 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7,8],ymm9[9],ymm1[10,11,12],ymm9[13],ymm1[14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u] +; AVX2-FAST-NEXT: vpor %xmm6, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -3467,8 +3465,8 @@ ; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0],ymm14[1,2,3,4,5,6,7],ymm7[8],ymm14[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm14[1,2,3,4,5,6,7],ymm8[8],ymm14[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm12[1,2,3,4,5,6,7],ymm4[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm10[1,2,3,4,5,6,7],ymm1[8],ymm10[9,10,11,12,13,14,15] @@ -3493,38 +3491,39 @@ ; AVX2-FAST-PERLANE-NEXT: subq $104, %rsp ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[5,12],zero,zero,xmm7[1,8,15,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm14, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb 
{{.*#+}} xmm2 = xmm2[0,7,14],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm7, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u],zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,13,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,5,12],zero,zero,xmm7[1,8,15],zero,zero,xmm7[u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm9[6,13],zero,zero,xmm9[2,9,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[1,8,15],zero,zero,xmm8[4,11],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm8, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm14, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm14, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0,1],ymm10[2],ymm9[3,4,5],ymm10[6],ymm9[7,8,9],ymm10[10],ymm9[11,12,13],ymm10[14],ymm9[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -3539,47 +3538,46 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm0[u,u,u,6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u],zero,zero,xmm0[4,11],zero,zero,xmm0[0,7,14,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,3,10,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,5,12,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[2,9,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[5,12] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm10[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm8, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm5, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u],zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, 
%ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm6, %ymm5, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm2[u,u,2,9],zero,zero,zero,xmm2[5,12],zero,zero,xmm2[u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm11[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm11[6,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm5, %ymm8 @@ -3592,9 +3590,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm13, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm5, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm5, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -3606,64 +3605,63 @@ ; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm15, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm3, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm3, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm1, %ymm4, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, 
%ymm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm0, %ymm14, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm14, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm14, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm14, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm14, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[2,9],zero,zero,zero,xmm6[5,12],zero,zero,xmm6[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8,9,10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm14, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm14, %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[0,7,14],zero,zero,xmm7[3,10,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm7, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5],ymm7[6],ymm4[7,8,9,10],ymm7[11],ymm4[12,13],ymm7[14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[3,10],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm15, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm15, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4,5,6],ymm15[7,8],ymm1[9,10],ymm15[11],ymm1[12,13,14],ymm15[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[2,9],zero,zero,zero,xmm6[5,12,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} 
xmm1 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4],ymm9[5,6],ymm6[7,8],ymm9[9,10,11],ymm6[12],ymm9[13,14],ymm6[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[3,10],zero,zero,zero,xmm6[6,13,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[5,12],zero,zero,xmm3[1,8,15],zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[6,13],zero,zero,xmm5[2,9],zero,zero,zero,xmm5[u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[2,9],zero,zero,zero,xmm4[5,12,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[4,11],zero,zero,xmm2[0,7,14],zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3],ymm4[4],ymm9[5,6],ymm4[7,8],ymm9[9,10,11],ymm4[12],ymm9[13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[5,12],zero,zero,xmm5[1,8,15],zero,zero,xmm5[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6,7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[6,13],zero,zero,xmm6[2,9],zero,zero,zero,xmm6[u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[4,11],zero,zero,xmm5[0,7,14,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm6[1],ymm8[2,3,4],ymm6[5],ymm8[6,7,8],ymm6[9],ymm8[10,11,12],ymm6[13],ymm8[14,15] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm5, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm14[0],ymm5[1],ymm14[2,3,4],ymm5[5],ymm14[6,7,8],ymm5[9],ymm14[10,11,12],ymm5[13],ymm14[14,15] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u],zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm13[u,u,0,7,14],zero,zero,xmm13[3,10],zero,zero,zero,xmm13[u,u,u,u] @@ -3674,31 +3672,31 @@ ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0],ymm5[1,2,3,4,5,6,7],ymm8[8],ymm5[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, (%rsp), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm0[0],mem[1,2,3,4,5,6,7],ymm0[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rsi) ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm5, (%rdx) -; AVX2-FAST-PERLANE-NEXT: 
vmovdqa %ymm4, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%r8) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%r9) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%rax) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: addq $104, %rsp ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq @@ -7348,497 +7346,506 @@ ; ; AVX2-SLOW-LABEL: load_i8_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $760, %rsp # imm = 0x2F8 -; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-SLOW-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa 320(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = 
[255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm8, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm6, %ymm10, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7,8,9],ymm2[10],ymm1[11,12,13],ymm2[14],ymm1[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm12, %ymm10, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7,8,9],ymm2[10],ymm0[11,12,13],ymm2[14],ymm0[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm8, %ymm7, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, 
%xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7,8,9],ymm4[10],ymm2[11,12,13],ymm4[14],ymm2[15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm2 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] +; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm11, %ymm15, %ymm3 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovdqa 352(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-SLOW-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm9, %ymm3 +; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: 
vmovdqa 432(%rdi), %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm14 -; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX2-SLOW-NEXT: vmovdqa 416(%rdi), %xmm11 +; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm11, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm12 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm10 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm12 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, %xmm8 +; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm6 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vpshufb 
%xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm12 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1 +; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm1, %xmm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm3, %xmm6 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm15, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm7, %ymm13, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm9 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm8 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm15, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm12 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm8 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, 
%ymm0, %ymm8 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm5 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm15, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm15 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm5 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm7 +; AVX2-SLOW-NEXT: vmovdqa %xmm12, %xmm11 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm12 +; AVX2-SLOW-NEXT: vpor %xmm7, %xmm12, %xmm7 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-SLOW-NEXT: vmovdqa %ymm14, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm7, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm7, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, 
%ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm12, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm8 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm3, %xmm5 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm11, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm15, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm15, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm5 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm13, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa %xmm13, %xmm10 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpor %xmm9, %xmm13, %xmm9 -; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm11, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm9 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm14 -; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm14, %xmm6 -; AVX2-SLOW-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm7, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm1 +; AVX2-SLOW-NEXT: vmovdqa %xmm3, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm14, %xmm5 +; AVX2-SLOW-NEXT: vmovdqa %xmm14, %xmm8 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, 
%ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa %ymm13, %ymm3 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm15, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm13 -; AVX2-SLOW-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX2-SLOW-NEXT: vpor %xmm9, %xmm13, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm15, %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm9, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm14, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm8, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; 
AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm10, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm1, %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm10, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm10, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm4 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm1, %ymm8 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm2, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm10, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm10, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm1, %ymm7 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm10, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm14 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm5 +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm9 +; AVX2-SLOW-NEXT: vmovdqa %ymm15, %ymm0 +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm9 -; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm5 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm8 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm10 -; AVX2-SLOW-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, 
%ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm13 +; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm3, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm11 +; AVX2-SLOW-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm6 -; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm10 -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm12 -; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm4 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm10, %ymm11, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm10, %ymm11, %ymm7 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm0 +; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-SLOW-NEXT: vpor %xmm0, %xmm12, %xmm12 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7,8,9,10],ymm0[11],ymm13[12,13],ymm0[14],ymm13[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = 
[255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX2-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm14, %xmm12 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm14, %xmm11 +; AVX2-SLOW-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7,8,9,10],ymm12[11],ymm1[12,13],ymm12[14],ymm1[15] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm11, %ymm1, %ymm14 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1,2],ymm3[3],ymm6[4,5,6],ymm3[7,8],ymm6[9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm15 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm4, %xmm10 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6],ymm10[7,8],ymm2[9,10],ymm10[11],ymm2[12,13,14],ymm10[15] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm13 +; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm12 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm11, %xmm3 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm8, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4],ymm6[5,6],ymm3[7,8],ymm6[9,10,11],ymm3[12],ymm6[13,14],ymm3[15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm1 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm4 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm4 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm10[1,2,3],ymm4[4],ymm10[5,6],ymm4[7,8],ymm10[9,10,11],ymm4[12],ymm10[13,14],ymm4[15] +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6],ymm4[7,8],ymm7[9,10,11],ymm4[12],ymm7[13,14],ymm4[15] ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm7 ; AVX2-SLOW-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm7 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0],ymm7[1],ymm2[2,3],ymm7[4],ymm2[5,6,7,8],ymm7[9],ymm2[10,11],ymm7[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm7 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX2-SLOW-NEXT: 
vpor %xmm5, %xmm6, %xmm5 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5,6,7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7,8],ymm8[9],ymm2[10,11,12],ymm8[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm6 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-SLOW-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7,8],ymm8[9],ymm2[10,11,12],ymm8[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm0 -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm6 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm2, %xmm10 ; AVX2-SLOW-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = +; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm15, %xmm15 +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1,2,3,4,5,6,7],ymm10[8],ymm6[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm10 +; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm15, %xmm10 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 ; AVX2-SLOW-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm10 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2,3,4,5,6,7],ymm14[8],ymm8[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm13[0],mem[1,2,3,4,5,6,7],ymm13[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = 
ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-SLOW-NEXT: # ymm9 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] @@ -7877,13 +7884,13 @@ ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-SLOW-NEXT: addq $760, %rsp # imm = 0x2F8 +; AVX2-SLOW-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: load_i8_stride7_vf64: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $728, %rsp # imm = 0x2D8 +; AVX2-FAST-NEXT: subq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7 @@ -7893,7 +7900,7 @@ ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 @@ -7904,8 +7911,8 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 @@ -7917,10 +7924,10 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm8, %ymm4 ; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu %ymm6, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm6 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm6 @@ -7928,80 +7935,80 @@ ; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm6, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7,8,9],ymm4[10],ymm2[11,12],ymm4[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm12, 
%ymm1, %ymm2, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm13, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7,8,9],ymm2[10],ymm1[11,12,13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm10, %ymm9, %ymm2 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm13 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm2 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm6, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7,8,9],ymm2[10],ymm1[11,12,13],ymm2[14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm7 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm7 ; AVX2-FAST-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 352(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm4 +; 
AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm3 +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm2 -; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm14 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 +; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm15 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,2,0,2,1,2,4,6] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpblendvb %ymm1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm14 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm8 +; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm3 +; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm15 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm15, %xmm8 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm8, %xmm0 ; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm8 -; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm14 +; AVX2-FAST-NEXT: vmovdqa %ymm9, %ymm15 ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm12, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -8016,339 +8023,345 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpblendvb 
%ymm14, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm6 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpermd %ymm14, %ymm9, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm6 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm15, %xmm12 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm12, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm12 -; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm8, %xmm10 -; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm9, %xmm11, %xmm9 +; AVX2-FAST-NEXT: vmovdqa 432(%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm0, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-NEXT: vmovdqa 416(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm11 +; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, 
%ymm0, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm7, %xmm1, %xmm8 +; AVX2-FAST-NEXT: vmovdqa 208(%rdi), %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm13 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm10 +; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm10, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = -; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm11 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm10 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm11, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vmovdqa %xmm14, %xmm13 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm14, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm4 -; AVX2-FAST-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm15 -; AVX2-FAST-NEXT: vpor %xmm14, %xmm15, %xmm14 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpor %xmm10, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-NEXT: vmovdqa %xmm15, %xmm6 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm15 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm14 +; AVX2-FAST-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm9, %ymm14, %ymm9 -; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm10 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm10, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm1, 
%xmm0, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm10, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm9, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm3, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm11 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm14, %ymm8 +; AVX2-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm5 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm8 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm9 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpor %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm9, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm10 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm11, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm14 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm10, %xmm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm14 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm14, %xmm10, %xmm10 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm10, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm9 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm10 -; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm9, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm3, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %xmm6, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm11 +; 
AVX2-FAST-NEXT: vpor %xmm14, %xmm11, %xmm11 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm14 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm0 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm8 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm8, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm10 +; AVX2-FAST-NEXT: vpor %xmm8, %xmm10, %xmm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm9 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm13, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm4, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm3 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm8, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm10 +; AVX2-FAST-NEXT: vpor %xmm6, %xmm10, %xmm6 +; AVX2-FAST-NEXT: vinserti128 
$1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm6, %ymm0 +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm14, %ymm3 +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm5, %xmm2 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm9 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm11 -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm1, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, 
%ymm10 -; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 -; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm12 +; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm11 +; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = -; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm14, %ymm4 -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm14, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm14, %ymm1, %ymm13 -; AVX2-FAST-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm14, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm11, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm11, %ymm3 +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm11, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm1, %ymm14 +; AVX2-FAST-NEXT: vmovdqu %ymm14, (%rsp) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm11, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm3 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1,2],ymm3[3],ymm10[4,5],ymm3[6],ymm10[7,8,9,10],ymm3[11],ymm10[12,13],ymm3[14],ymm10[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm9 -; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7,8,9,10],ymm9[11],ymm0[12,13],ymm9[14],ymm0[15] -; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm0, %ymm0 
-; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1,2],ymm5[3],ymm12[4,5],ymm5[6],ymm12[7,8,9,10],ymm5[11],ymm12[12,13],ymm5[14],ymm12[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm6, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7,8,9,10],ymm5[11],ymm0[12,13],ymm5[14],ymm0[15] +; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm12, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm9 -; AVX2-FAST-NEXT: vpor %xmm1, %xmm9, %xmm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm9 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm2[1,2],ymm9[3],ymm2[4,5,6],ymm9[7,8],ymm2[9,10],ymm9[11],ymm2[12,13,14],ymm9[15] +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm5 +; AVX2-FAST-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm2[1,2],ymm5[3],ymm2[4,5,6],ymm5[7,8],ymm2[9,10],ymm5[11],ymm2[12,13,14],ymm5[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm9 -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm4, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3],ymm4[4,5,6],ymm1[7,8],ymm4[9,10],ymm1[11],ymm4[12,13,14],ymm1[15] +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3],ymm3[4,5,6],ymm1[7,8],ymm3[9,10],ymm1[11],ymm3[12,13,14],ymm1[15] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm12 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm0 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm13 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: 
vpshufb %xmm4, %xmm8, %xmm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6],ymm1[7,8],ymm6[9,10,11],ymm1[12],ymm6[13,14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm0 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm7[1,2,3],ymm1[4],ymm7[5,6],ymm1[7,8],ymm7[9,10,11],ymm1[12],ymm7[13,14],ymm1[15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm2 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm7[1,2,3],ymm2[4],ymm7[5,6],ymm2[7,8],ymm7[9,10,11],ymm2[12],ymm7[13,14],ymm2[15] -; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX2-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm6, %ymm2 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm6 -; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX2-FAST-NEXT: vpor %xmm4, %xmm5, %xmm4 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm5 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6,7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13,14,15] -; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm6 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm11[0],ymm7[1],ymm11[2,3,4],ymm7[5],ymm11[6,7,8],ymm7[9],ymm11[10,11,12],ymm7[13],ymm11[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm7, %ymm6 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm8[1,2,3],ymm2[4],ymm8[5,6],ymm2[7,8],ymm8[9,10,11],ymm2[12],ymm8[13,14],ymm2[15] +; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpor %xmm5, %xmm7, %xmm5 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vpor %xmm2, %xmm7, %xmm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm7 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] -; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm7 +; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm7, %xmm3 +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm9, %xmm4 +; AVX2-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX2-FAST-NEXT: vmovdqu (%rsp), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0],ymm4[1],ymm7[2,3],ymm4[4],ymm7[5,6,7,8],ymm4[9],ymm7[10,11],ymm4[12],ymm7[13,14,15] +; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm8, %xmm4 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 +; AVX2-FAST-NEXT: vpor %xmm4, %xmm8, %xmm4 +; AVX2-FAST-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm8 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] +; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm8, %ymm8 +; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm4, %ymm8, %ymm4 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm8 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX2-FAST-NEXT: vpor %xmm7, %xmm11, %xmm7 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-NEXT: vpor %xmm8, %xmm11, %xmm8 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [1,3,1,2,1,3,5,6] -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm13 # 32-byte Folded Reload +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm7[1,2,3,4,5,6,7],ymm3[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vextracti128 $1, %ymm13, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm8 -; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm7[1,2,3,4,5,6,7],ymm15[8],ymm7[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm15[0,1,2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm8 = ymm9[0],mem[1,2,3,4,5,6,7],ymm9[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm12, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm8[1,2,3,4,5,6,7],ymm12[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm12 +; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm10 +; AVX2-FAST-NEXT: vpor %xmm9, %xmm10, %xmm9 +; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm10 # 32-byte Folded 
Reload +; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0],ymm9[1,2,3,4,5,6,7],ymm6[8],ymm9[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm13[0],mem[1,2,3,4,5,6,7],ymm13[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm1[0],mem[1,2,3,4,5,6,7],ymm1[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7] @@ -8358,15 +8371,15 @@ ; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] +; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm10 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm10 = ymm7[0],mem[1,2,3,4,5,6,7],ymm7[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm10 = ymm4[0],mem[1,2,3,4,5,6,7],ymm4[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm6[0],mem[1,2,3,4,5,6,7],ymm6[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm10 = ymm5[0],mem[1,2,3,4,5,6,7],ymm5[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm10, (%rsi) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -8375,515 +8388,524 @@ ; AVX2-FAST-NEXT: vmovaps %ymm10, (%rdx) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm10, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm7, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm6, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%rcx) ; AVX2-FAST-NEXT: vmovdqa %ymm9, (%r8) -; AVX2-FAST-NEXT: vmovdqa %ymm8, 32(%r8) +; AVX2-FAST-NEXT: vmovdqa %ymm5, 32(%r8) ; AVX2-FAST-NEXT: vmovdqa %ymm0, (%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%r9) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) 
+; AVX2-FAST-NEXT: vmovdqa %ymm3, (%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm2, 32(%rax) ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovdqa %ymm5, (%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm6, 32(%rax) -; AVX2-FAST-NEXT: addq $728, %rsp # imm = 0x2D8 +; AVX2-FAST-NEXT: vmovdqa %ymm4, (%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm7, 32(%rax) +; AVX2-FAST-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: load_i8_stride7_vf64: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: subq $760, %rsp # imm = 0x2F8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm8 +; AVX2-FAST-PERLANE-NEXT: subq $712, %rsp # imm = 0x2C8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 320(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm4, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm8, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm6, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm6, %ymm10, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm5[2],ymm2[3,4],ymm5[5],ymm2[6,7,8,9],ymm5[10],ymm2[11,12],ymm5[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm11, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7,8,9],ymm2[10],ymm1[11,12,13],ymm2[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm12, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm4, %ymm2 +; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7,8,9],ymm2[10],ymm0[11,12,13],ymm2[14],ymm0[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm7, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4,5],ymm4[6],ymm2[7,8,9],ymm4[10],ymm2[11,12,13],ymm4[14],ymm2[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm13, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm2, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm14, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm15, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm4, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 384(%rdi), %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 352(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm13, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} 
xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm9, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm3, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa 432(%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm3, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 416(%rdi), %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), 
%xmm10 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm4, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm6[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm9[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm1, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5,6],ymm5[7] +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm15, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm13, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm1, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm3, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm15, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-FAST-PERLANE-NEXT: 
vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm15, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm10, %ymm15 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm12, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm12, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm13, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm12, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm11, %ymm7, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm14, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm0 = 
[0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm12, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm4, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm12, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm13, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm6, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm3, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm11, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm15, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm13, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm13, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm9, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm5, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm10, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm11, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm14, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm14, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm5, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm14, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm4, %ymm7, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm14, %xmm5 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm14, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, 
%xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm7, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm13, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm13 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm13, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm13 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm9, %xmm13, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm7, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm15, %ymm4, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm9, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm14, %ymm3, %ymm4, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm14, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm8, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm10, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm10, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm10, %ymm1, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm10, %ymm1, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm10, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm2, %ymm4 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm1, %ymm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm10, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm10, %ymm1, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm1, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm10, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm10, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, 
%ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm14 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm5 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm15, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm1, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm1, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm1, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm3, %ymm1, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm3, %ymm1, %ymm6 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm11 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm3, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm0, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm0, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3],ymm9[4,5],ymm0[6],ymm9[7,8,9,10],ymm0[11],ymm9[12,13],ymm0[14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte 
Reload +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm10, %ymm11, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm10, %ymm11, %ymm7 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm10, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm12, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm12, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1,2],ymm0[3],ymm13[4,5],ymm0[6],ymm13[7,8,9,10],ymm0[11],ymm13[12,13],ymm0[14],ymm13[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5],ymm3[6],ymm1[7,8,9,10],ymm3[11],ymm1[12,13],ymm3[14],ymm1[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm12, %ymm13, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm14, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm14 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm14, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm11, %xmm11 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm12 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5],ymm12[6],ymm1[7,8,9,10],ymm12[11],ymm1[12,13],ymm12[14],ymm1[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm11, %ymm1, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, 
%ymm5, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1,2],ymm3[3],ymm5[4,5,6],ymm3[7,8],ymm5[9,10],ymm3[11],ymm5[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1,2],ymm3[3],ymm6[4,5,6],ymm3[7,8],ymm6[9,10],ymm3[11],ymm6[12,13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm11, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm4, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm10, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6],ymm10[7,8],ymm2[9,10],ymm10[11],ymm2[12,13,14],ymm10[15] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm2, %ymm2 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm2, %ymm13 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm4, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm4, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm3, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4,5,6],ymm4[7,8],ymm3[9,10],ymm4[11],ymm3[12,13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm3, %ymm3 +; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm12 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm8[1,2,3],ymm3[4],ymm8[5,6],ymm3[7,8],ymm8[9,10,11],ymm3[12],ymm8[13,14],ymm3[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm6, %xmm3 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4],ymm6[5,6],ymm3[7,8],ymm6[9,10,11],ymm3[12],ymm6[13,14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm14, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm14, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm4, %xmm1 -; AVX2-FAST-PERLANE-NEXT: 
vextracti128 $1, %ymm10, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm10[1,2,3],ymm4[4],ymm10[5,6],ymm4[7,8],ymm10[9,10,11],ymm4[12],ymm10[13,14],ymm4[15] +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1,2,3],ymm4[4],ymm7[5,6],ymm4[7,8],ymm7[9,10,11],ymm4[12],ymm7[13,14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm4 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm7, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm7, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3],ymm7[4],ymm8[5,6,7,8],ymm7[9],ymm8[10,11],ymm7[12],ymm8[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm2[0],ymm7[1],ymm2[2,3],ymm7[4],ymm2[5,6,7,8],ymm7[9],ymm2[10,11],ymm7[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm7, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm7 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm7, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm2[0],ymm6[1],ymm2[2,3],ymm6[4],ymm2[5,6,7,8],ymm6[9],ymm2[10,11],ymm6[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm7 +; 
AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7,8],ymm8[9],ymm10[10,11,12],ymm8[13],ymm10[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7,8],ymm8[9],ymm2[10,11,12],ymm8[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm8, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm8, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm8 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1],ymm2[2,3,4],ymm8[5],ymm2[6,7,8],ymm8[9],ymm2[10,11,12],ymm8[13],ymm2[14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm8, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm6 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm2, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm15, %xmm15 
+; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm6[1,2,3,4,5,6,7],ymm10[8],ymm6[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload +; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm15, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm10, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm12, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm15, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm8, %xmm9, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm9, %xmm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm10 +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm9, %xmm9 +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm2[0],mem[1,2,3,4,5,6,7],ymm2[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm15[0],mem[1,2,3,4,5,6,7],ymm15[8],mem[9,10,11,12,13,14,15] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm14[0],ymm8[1,2,3,4,5,6,7],ymm14[8],ymm8[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm13[0],mem[1,2,3,4,5,6,7],ymm13[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3],ymm9[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm12[0],mem[1,2,3,4,5,6,7],ymm12[8],mem[9,10,11,12,13,14,15] +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpblendw $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm3[0],mem[1,2,3,4,5,6,7],ymm3[8],mem[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] @@ -8922,7 +8944,7 @@ ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, 32(%rax) -; AVX2-FAST-PERLANE-NEXT: addq $760, %rsp # imm = 0x2F8 +; AVX2-FAST-PERLANE-NEXT: addq $712, %rsp # imm = 0x2C8 ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll @@ -329,8 +329,7 @@ ; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm4[0],xmm3[0] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = ymm2[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,18,19,26,27,u,u,u,u,u,u,u,u,20,21,28,29] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [4,6,1,3,4,6,1,3] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <4,6,u,u,u,u,1,3> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2,3,4,5],ymm2[6],ymm6[7] @@ -662,20 +661,17 @@ ; AVX2-FAST-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,1,5,0,4,1,5] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,4,u,u,0,4,1,5> ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,0,1,4,5,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u,24,25,28,29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,8,9,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,20,21,28,29,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,4,4,0,0,4,4,0] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,12,13,8,9,u,u,u,u,u,u,u,u,18,19,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,6,1,5,2,6,1,5] -; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,10,11,14,15,u,u,u,u,u,u,u,u,16,17,20,21,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm0[0,2,1,3] @@ -684,8 +680,7 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,2,1,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[4,5,12,13,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [7,3,3,7,7,3,3,7] -; AVX2-FAST-NEXT: # ymm5 = 
mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,8,9,12,13,u,u,u,u,u,u,u,u,22,23,18,19,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] @@ -1379,8 +1374,7 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm7, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,2,1] @@ -2918,8 +2912,7 @@ ; AVX2-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6],ymm1[7] @@ -6581,8 +6574,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -361,8 +361,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <5,7,1,3,7,u,u,u> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,1,3,5,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,4,5,8,9],zero,zero,zero,zero,zero,zero,zero,zero,ymm3[18,19,22,23,26,27],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 @@ -370,8 +369,7 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[0,1,8,9,4,5,6,7,4,5],zero,zero,ymm2[26,27],zero,zero,zero,zero,ymm2[24,25,20,21,22,23,20,21,28,29] ; AVX2-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: 
vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,2,4,u,0,2,4,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,0,1,4,5,8,9,u,u,u,u,u,u,u,u,18,19,22,23,26,27,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -334,8 +334,7 @@ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 @@ -343,8 +342,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -294,8 +294,7 @@ ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm1 ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] @@ -575,8 +574,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1,2],ymm7[3],ymm0[4,5],ymm7[6],ymm0[7] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] @@ -1131,8 +1129,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm0[1,2],ymm5[3],ymm0[4,5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] 
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] @@ -2280,8 +2277,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm6 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [1,0,2,2,1,0,2,2] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm4[0,0,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -267,17 +267,15 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm8 ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,4,1,5,0,4,1,5] -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = <0,4,u,u,u,u,1,5> ; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [17179869184,17179869184,17179869184,17179869184] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [25769803778,25769803778,25769803778,25769803778] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = ; AVX2-ONLY-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [1,5,2,6,1,5,2,6] -; AVX2-ONLY-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = <1,5,u,u,u,u,2,6> ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] @@ -286,8 +284,7 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] -; AVX2-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7] -; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = <2,6,u,u,u,u,3,7> ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 64(%rax) @@ -609,8 +606,7 @@ ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[2,3,2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,2,3,3,u,u,3,3> ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm9, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm9, %ymm9 @@ -1375,8 +1371,7 @@ ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = 
xmm8[2,3,2,3] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm8, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm8 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <2,2,3,3,u,u,3,3> ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm11 @@ -3312,8 +3307,7 @@ ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm14 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,2,3,3,u,u,3,3> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm15 @@ -6764,182 +6758,182 @@ ; AVX2-FAST-LABEL: store_i32_stride6_vf64: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: subq $2312, %rsp # imm = 0x908 -; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm6 -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm9 +; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm9 ; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm11[2],xmm6[3],xmm11[3] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm3 +; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm7 ; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 64(%rcx), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2 -; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX2-FAST-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm0 +; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero ; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 4(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,2,2,3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm8[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm4 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm10[0],zero,xmm10[1],zero +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 36(%r9), %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 64(%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,2,2,3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm3[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %xmm11 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm3 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm8 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm3[2],xmm11[2],xmm3[3],xmm11[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovdqa 64(%r8), %xmm12 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm12[0],zero,xmm12[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 68(%r9), %ymm8 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm8[3],ymm7[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 96(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa 
%xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 96(%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm4[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm7[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 96(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %xmm7 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovdqa 96(%r8), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm10 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 100(%r9), %ymm10 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2],ymm10[3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 128(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 128(%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm9[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 128(%rsi), %xmm10 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %xmm9 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm15 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm9[2],xmm10[2],xmm9[3],xmm10[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovdqa 128(%r8), %xmm14 ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm14[0],zero,xmm14[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,3],ymm15[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 132(%r9), %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2],ymm15[3],ymm13[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 160(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 160(%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa 160(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %xmm15 -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm5 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3],ymm5[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vmovdqa 160(%r8), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 164(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 192(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 192(%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm0 +; AVX2-FAST-NEXT: vmovdqa 192(%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vmovdqa 192(%rdi), %xmm4 
+; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqa 192(%r8), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 196(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm5 -; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,2,2,3] -; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm0[1,2,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rcx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[1,2,2,3] +; AVX2-FAST-NEXT: vmovdqa 224(%rdx), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm13 = xmm4[1,2,2,3] ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa 224(%rsi), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm13 +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm13 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm13[4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 224(%r8), %xmm0 -; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm0[0],zero,xmm0[1],zero +; AVX2-FAST-NEXT: vmovdqa 224(%r8), %xmm4 +; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm13 = xmm4[0],zero,xmm4[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 228(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm13[3],ymm5[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd (%rcx), %xmm5 ; AVX2-FAST-NEXT: vpbroadcastd (%rdx), %xmm13 ; 
AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm0 @@ -6947,19 +6941,19 @@ ; AVX2-FAST-NEXT: vpbroadcastd (%r9), %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm1 -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm1[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm2[0,1,2,2,4,5,6,6] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 20(%r9), %ymm5 @@ -6968,9 +6962,9 @@ ; AVX2-FAST-NEXT: vbroadcastss 32(%rcx), %xmm0 ; AVX2-FAST-NEXT: vbroadcastss 32(%rdx), %xmm5 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm5 = xmm4[0],mem[0],xmm4[1],mem[1] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload @@ -6978,20 +6972,20 @@ ; AVX2-FAST-NEXT: vbroadcastss 32(%r9), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa 
32(%rdx), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %ymm6 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm6[0,1,2,2,4,5,6,6] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[1,1,2,3,5,5,6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm5 = ymm4[1,1,2,3,5,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0],ymm0[1],ymm5[2],ymm0[3],ymm5[4],ymm0[5],ymm5[6],ymm0[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm5 -; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm5 = ymm11[2],ymm5[2],ymm11[3],ymm5[3],ymm11[6],ymm5[6],ymm11[7],ymm5[7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm4 +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] +; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastd 52(%r9), %ymm5 @@ -7000,7 +6994,7 @@ ; AVX2-FAST-NEXT: vpbroadcastd 64(%rcx), %xmm0 ; AVX2-FAST-NEXT: vpbroadcastd 64(%rdx), %xmm5 ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,2,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq %xmm12, %ymm3 @@ -7187,258 +7181,257 @@ ; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm4 -; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm3 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1],ymm4[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,3,3,u,u,3,3> +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm12[0,1],ymm3[2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovdqa (%r9), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm4 = ymm4[0],mem[0],ymm4[1],mem[1],ymm4[4],mem[4],ymm4[5],mem[5] -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = 
ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0],ymm13[1],ymm3[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm3 = ymm3[0],mem[0],ymm3[1],mem[1],ymm3[4],mem[4],ymm3[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm13[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5],ymm4[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 16(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm13[5],ymm4[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm13[5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm2 = mem[2,3],ymm2[2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [6,5,3,3,6,5,7,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,6,2,3,4,6,6,7] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; 
AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1],ymm1[2,3,4,5],ymm12[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX2-FAST-NEXT: vpermd %ymm12, %ymm0, %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm12, %ymm4, %ymm13 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4,5,6],ymm13[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 48(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm13[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm6[2],ymm1[3],ymm6[3],ymm1[6],ymm6[6],ymm1[7],ymm6[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 64(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,3,4,5],ymm6[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm0[2,3,4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqa 64(%r9), %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm6, %ymm0, %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4,5,6],ymm12[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm12 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4,5,6],ymm12[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm12[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 80(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; 
AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm11[0],ymm3[1],ymm11[1],ymm3[4],ymm11[4],ymm3[5],ymm11[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm1[0],ymm11[0],ymm1[1],ymm11[1],ymm1[4],ymm11[4],ymm1[5],ymm11[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 112(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 128(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqa 128(%r9), %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4,5,6],ymm6[7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm4, %ymm6 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4,5,6],ymm6[7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: 
vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[4],ymm10[4],ymm3[5],ymm10[5] +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm6 = ymm1[0],ymm10[0],ymm1[1],ymm10[1],ymm1[4],ymm10[4],ymm1[5],ymm10[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 144(%r9), %ymm6 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[6],ymm10[6],ymm1[7],ymm10[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0],ymm5[1],ymm1[2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm0[0],ymm2[1],ymm0[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: 
vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 160(%r8), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovdqa 160(%r9), %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm5 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm3[0],ymm9[0],ymm3[1],ymm9[1],ymm3[4],ymm9[4],ymm3[5],ymm9[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 176(%r9), %ymm5 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm1[0,1,2,3,4],ymm5[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = mem[2,3],ymm1[2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7] +; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm0 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm3 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm2 -; 
AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm4 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovdqa 192(%r8), %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovdqa 192(%r9), %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm13 = ymm12[0],ymm15[0],ymm12[1],ymm15[1],ymm12[4],ymm15[4],ymm12[5],ymm15[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 208(%r9), %ymm13 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm13[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm13 = ymm12[2],ymm15[2],ymm12[3],ymm15[3],ymm12[6],ymm15[6],ymm12[7],ymm15[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm13[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = mem[2,3],ymm13[2,3] -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm13[2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm13[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm12, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vmovdqa 224(%r8), %ymm13 -; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm15 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3,4,5],ymm15[6,7] +; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm15 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovdqa 224(%r9), %ymm15 -; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm0[1],ymm3[2,3,4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermd %ymm15, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm3 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; AVX2-FAST-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm4 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} ymm12 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[4],ymm14[4],ymm0[5],ymm14[5] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1],ymm12[2,3],ymm4[4,5,6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vpbroadcastd 240(%r9), %ymm12 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm12[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm12[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} ymm12 = ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[6],ymm14[6],ymm0[7],ymm14[7] ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vperm2i128 $19, (%rsp), %ymm12, %ymm12 # 32-byte Folded Reload @@ -7449,10 +7442,10 @@ ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa %ymm7, 1504(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm3, 1440(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm1, 1408(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm2, 1312(%rax) -; AVX2-FAST-NEXT: vmovdqa %ymm4, 1248(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm4, 1440(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm2, 1408(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm1, 1312(%rax) +; AVX2-FAST-NEXT: vmovdqa %ymm3, 1248(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm5, 1216(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm9, 1120(%rax) ; AVX2-FAST-NEXT: vmovdqa %ymm6, 1056(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll --- 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -103,8 +103,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -133,8 +132,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <3,5,7,u> ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,u,1> @@ -173,8 +171,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1] -; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -420,8 +417,7 @@ ; AVX2-SLOW-NEXT: vinsertf128 $1, (%rsi), %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm6 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3] -; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = <2,6,u,u,u,u,u,3> ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2] @@ -434,18 +430,16 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6] -; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <5,u,u,u,u,u,2,6> ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm9 = ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm9, %ymm6 -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1] -; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm9 = <0,4,u,u,u,u,u,1> ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm6, %ymm6 ; AVX2-SLOW-NEXT: vbroadcastss (%r10), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] @@ -478,15 +472,13 @@ ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [5,0,2,6,5,0,2,6] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <5,u,u,u,u,u,2,6> ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0],ymm8[1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm8, %ymm8 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [2,6,0,3,2,6,0,3] -; AVX2-FAST-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <2,6,u,u,u,u,u,3> ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 @@ -495,11 +487,10 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,4,0,1,0,4,0,1] -; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,4,u,u,u,u,u,1> ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [17179869184,17179869184,17179869184,17179869184] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vbroadcastss (%r10), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] @@ -529,8 +520,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%rsi), %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3] -; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = <2,6,u,u,u,u,u,3> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2] @@ -543,18 +533,16 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6] -; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <5,u,u,u,u,u,2,6> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm9[3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm9 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm9, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm9 = [0,4,0,1,0,4,0,1] -; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm9 = <0,4,u,u,u,u,u,1> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm9, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = 
[17179869184,17179869184,17179869184,17179869184] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss (%r10), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] @@ -953,15 +941,14 @@ ; AVX2-FAST-LABEL: store_i32_stride7_vf8: ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm2 ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm10 ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm3 ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm7 ; AVX2-FAST-NEXT: vmovaps (%r9), %ymm8 -; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vmovaps (%rax), %xmm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm5 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[1,1,1,1] @@ -977,10 +964,10 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm6[3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm10[2],ymm3[2],ymm10[3],ymm3[3],ymm10[6],ymm3[6],ymm10[7],ymm3[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1,2,2,5,5,6,6] @@ -988,10 +975,10 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3,4,5],ymm9[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2,3,4,5],ymm9[6,7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6],ymm6[7] @@ -1001,14 +988,13 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm15[2,3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2],ymm6[3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = 
ymm9[0,1,2],ymm6[3,4,5,6],ymm9[7] +; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm13[3,3],xmm14[3,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = <0,1,2,2,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm15, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] @@ -1017,7 +1003,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[6],ymm10[6],ymm3[7],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[6],ymm0[6],ymm2[7],ymm0[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,3],ymm8[3,3],ymm7[7,7],ymm8[7,7] @@ -1028,25 +1014,25 @@ ; AVX2-FAST-NEXT: vbroadcastss %xmm12, %xmm9 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm8[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm8 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm10, %ymm9 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm8[0],ymm9[0],ymm8[2],ymm9[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = <0,1,2,2,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm9, %ymm10, %ymm9 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 +; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm10 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6],ymm8[7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} 
xmm1 = xmm2[0,1,2],xmm1[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps %ymm1, 96(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) +; AVX2-FAST-NEXT: vmovaps %ymm0, 96(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm8, (%rax) ; AVX2-FAST-NEXT: vmovaps %ymm7, 192(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm6, 64(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2058,55 +2044,54 @@ ; ; AVX2-FAST-LABEL: store_i32_stride7_vf16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: subq $536, %rsp # imm = 0x218 +; AVX2-FAST-NEXT: subq $520, %rsp # imm = 0x208 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-FAST-NEXT: vmovaps (%rax), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rax), %xmm8 +; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps (%r8), %xmm7 -; AVX2-FAST-NEXT: vmovaps %xmm7, (%rsp) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm3 -; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %xmm8 -; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] -; AVX2-FAST-NEXT: vmovaps %xmm4, %xmm14 -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] +; AVX2-FAST-NEXT: vmovaps (%r8), %xmm10 +; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r8), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %xmm11 +; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm2 +; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm6 -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = zero,xmm6[1],xmm9[1],zero +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm5 +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm5[1],xmm9[1],zero ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm12 ; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm5 -; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 -; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 +; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm1 +; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm4 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0],ymm3[1,2],ymm6[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] -; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] -; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm2 -; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm4 = zero,xmm2[1],xmm11[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm11[1,1,1,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm10[1],xmm3[2,3] +; AVX2-FAST-NEXT: vbroadcastsd %xmm3, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5],ymm3[6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm12[2],xmm3[3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,2,1] +; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm13 +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm13[1],xmm2[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2],ymm3[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2114,178 +2099,177 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm1 -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm2 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 -; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps (%r9), %ymm1 +; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm3 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps (%r8), %ymm1 +; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps (%r9), %ymm2 +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4,5],ymm1[6],ymm3[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] +; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm13 -; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm12 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm2 -; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm2[2],ymm11[2],ymm2[3],ymm11[3],ymm2[6],ymm11[6],ymm2[7],ymm11[7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm10 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm10[6,7] -; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm4 -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm15 = [25769803781,25769803781,25769803781,25769803781] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm15, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm12 +; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm11 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm12[2],ymm11[2],ymm12[3],ymm11[3],ymm12[6],ymm11[6],ymm12[7],ymm11[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm10 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm6[2],ymm10[2],ymm6[3],ymm10[3],ymm6[6],ymm10[6],ymm6[7],ymm10[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm14[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 32(%r8), %ymm8 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm8[6,7] +; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm3 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm15 = <5,u,u,u,u,u,u,6> +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm15, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4,5,6],ymm15[7] +; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3],ymm14[2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm11[2],ymm2[2],ymm11[3],ymm2[3],ymm11[6],ymm2[6],ymm11[7],ymm2[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 60(%r8), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vbroadcastss 60(%r9), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vbroadcastsd 56(%rax), %ymm1 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 60(%r8), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] +; AVX2-FAST-NEXT: vbroadcastss 60(%r9), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] +; AVX2-FAST-NEXT: vbroadcastsd 56(%rax), %ymm14 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6],ymm14[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm3[3,3] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = xmm6[2],xmm9[2],xmm6[3],xmm9[3] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm15, %ymm7, %ymm15 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] -; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm0 -; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm7[3,3],xmm4[3,3] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm15 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <0,1,2,2,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm15, %ymm1, %ymm15 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm0[5,6],ymm15[7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm1 -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm6 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = 
xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm15, %ymm5 -; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[2],ymm5[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm14[2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm2 +; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm5 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = <0,1,2,2,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm5[2,3],ymm2[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastsd %xmm15, %ymm4 +; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm2[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm11[1,1],ymm2[5,5],ymm11[5,5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] -; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1],ymm10[1,1],ymm6[5,5],ymm10[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3,4],ymm12[5],ymm1[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] +; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3],xmm6[3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = 
xmm5[3,3],xmm6[3,3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm3 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm4 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm2 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] +; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vbroadcastss %xmm0, %xmm3 -; AVX2-FAST-NEXT: vbroadcastss %xmm9, %xmm4 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm0 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm10[0],xmm14[1],xmm10[1] +; AVX2-FAST-NEXT: vbroadcastss %xmm0, %xmm2 +; AVX2-FAST-NEXT: vbroadcastss %xmm13, %xmm3 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5,6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm3 = xmm14[0],xmm8[0],xmm14[1],xmm8[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastsd %xmm10, %ymm4 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastsd %xmm9, %ymm4 ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[2],ymm4[2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = 
ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm1[1,1],ymm3[5,5],ymm1[5,5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm4[5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,2,2] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1],ymm8[1,1],ymm2[5,5],ymm8[5,5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm4[0,1,2],ymm9[3,4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[6],ymm3[6],ymm1[7],ymm3[7] -; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6],ymm4[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[6],ymm2[6],ymm8[7],ymm2[7] +; AVX2-FAST-NEXT: vmovaps %ymm8, %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm0[2],ymm6[2],ymm0[3],ymm6[3],ymm0[6],ymm6[6],ymm0[7],ymm6[7] -; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovaps %ymm6, %ymm0 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] +; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm13 +; AVX2-FAST-NEXT: vmovaps %ymm6, %ymm8 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[3,3],ymm7[3,3],ymm2[7,7],ymm7[7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,3],ymm1[3,3],ymm0[7,7],ymm1[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4],ymm5[5,6],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm11[0],ymm12[1],ymm11[1],ymm12[4],ymm11[4],ymm12[5],ymm11[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm10[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = 
ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm6 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm6 = xmm8[3,3],mem[3,3] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm0[3,3],mem[3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm8[0],ymm13[0],ymm8[1],ymm13[1],ymm8[4],ymm13[4],ymm8[5],ymm13[5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm2[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm7 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm7 = xmm14[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm10[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm6, 96(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm5, 320(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm4, 192(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm9, 128(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm3, 128(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, (%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2306,7 +2290,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 384(%rax) -; AVX2-FAST-NEXT: addq $536, %rsp # imm = 0x218 +; AVX2-FAST-NEXT: addq $520, %rsp # imm = 0x208 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4685,7 +4669,7 @@ ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],mem[6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [25769803781,25769803781,25769803781,25769803781] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = <5,u,u,u,u,u,u,6> ; AVX2-FAST-NEXT: vpermps 96(%r9), %ymm11, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1,2,3,4,5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovaps 96(%rax), %ymm15 @@ -4713,8 +4697,7 @@ ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm13[0],xmm15[1],xmm13[1] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm7 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = <0,1,2,2,u,u,2,2> ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload @@ -9368,7 +9351,7 @@ ; ; AVX2-FAST-LABEL: store_i32_stride7_vf64: ; AVX2-FAST: # %bb.0: -; 
AVX2-FAST-NEXT: subq $3096, %rsp # imm = 0xC18 +; AVX2-FAST-NEXT: subq $3048, %rsp # imm = 0xBE8 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps (%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9558,11 +9541,11 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm14 +; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm11 ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm14[2],ymm1[2],ymm14[3],ymm1[3],ymm14[6],ymm1[6],ymm14[7],ymm1[7] -; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm11[2],ymm1[2],ymm11[3],ymm1[3],ymm11[6],ymm1[6],ymm11[7],ymm1[7] +; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9691,9 +9674,8 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm11 -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm11[2],ymm1[3],ymm11[3],ymm1[6],ymm11[6],ymm1[7],ymm11[7] -; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm10 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm10[2],ymm1[3],ymm10[3],ymm1[6],ymm10[6],ymm1[7],ymm10[7] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm13 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -9720,9 +9702,9 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm3 -; AVX2-FAST-NEXT: vmovaps 224(%rdx), %xmm6 -; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm6[1],xmm3[1],zero -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1,2],ymm2[3,4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 224(%rdx), %xmm5 +; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm6 = zero,xmm5[1],xmm3[1],zero +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 228(%r8), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm7 @@ -9730,16 +9712,15 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rax), %xmm15 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm6, %ymm6 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm6[1],ymm2[2,3,4],ymm6[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss %xmm3, %xmm2 -; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm5 -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; AVX2-FAST-NEXT: 
vbroadcastss %xmm5, %xmm6 +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm2[0],xmm6[1],xmm2[1] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [0,1,2,2,0,1,2,2] -; AVX2-FAST-NEXT: # ymm10 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = <0,1,2,2,u,u,2,2> +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 224(%r8), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] @@ -9749,12 +9730,12 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6],ymm2[7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm3[2],xmm6[3],xmm3[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm6 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 232(%rax), %ymm1 @@ -9778,7 +9759,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[6],ymm8[6],ymm9[7],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm11[2],ymm13[2],ymm11[3],ymm13[3],ymm11[6],ymm13[6],ymm11[7],ymm13[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 220(%r8), %ymm15 @@ -9801,15 +9782,15 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[6],ymm0[6],ymm3[7],ymm0[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm7 = [25769803781,25769803781,25769803781,25769803781] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = <5,u,u,u,u,u,u,6> ; AVX2-FAST-NEXT: vpermps 224(%r9), %ymm7, %ymm7 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm5[1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-NEXT: vmovaps 224(%rax), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm7[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm7[2,3],ymm15[2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = 
ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[6],ymm3[6],ymm0[7],ymm3[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] @@ -9823,30 +9804,30 @@ ; AVX2-FAST-NEXT: vbroadcastsd 248(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9857,8 +9838,8 @@ ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm1 = ymm14[1,1],mem[1,1],ymm14[5,5],mem[5,5] +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm11[1,1],mem[1,1],ymm11[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: 
vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload @@ -9877,7 +9858,7 @@ ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -9890,7 +9871,7 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm6[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] @@ -9925,7 +9906,7 @@ ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -9938,7 +9919,7 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] @@ -9973,7 +9954,7 @@ ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -9986,7 +9967,7 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] @@ -10021,7 +10002,7 @@ ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; 
AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -10034,7 +10015,7 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] @@ -10069,7 +10050,7 @@ ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -10082,7 +10063,7 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm4[3,3],xmm3[3,3] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] @@ -10095,9 +10076,9 @@ ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm14[1,1],ymm1[5,5],ymm14[5,5] +; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm1 = ymm1[1,1],mem[1,1],ymm1[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] @@ -10109,42 +10090,40 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-FAST-NEXT: vbroadcastss %xmm5, %xmm0 +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vbroadcastss %xmm6, %xmm0 ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vbroadcastss %xmm4, %xmm1 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm7[3,3] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 200(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3,4],ymm7[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -10181,8 +10160,8 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm6[0],ymm1[1,2],ymm6[3,4],ymm1[5,6],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10210,8 +10189,8 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2],ymm6[3,4],ymm1[5,6],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10238,77 +10217,78 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3,4],ymm1[5,6],ymm5[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = mem[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2],ymm6[3,4],ymm1[5,6],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm3 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6],ymm6[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm4[3,3],mem[3,3] -; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX2-FAST-NEXT: # xmm5 = xmm5[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm5[1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm4[3,3],mem[3,3] +; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload +; AVX2-FAST-NEXT: # xmm6 = xmm6[0,1,2],mem[3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm6[1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload ; 
AVX2-FAST-NEXT: # ymm0 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7] -; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] +; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm6 = ymm2[2],ymm12[2],ymm2[3],ymm12[3],ymm2[6],ymm12[6],ymm2[7],ymm12[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm5[1,2],ymm12[3,4],ymm5[5,6],ymm12[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0],ymm6[1,2],ymm12[3,4],ymm6[5,6],ymm12[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5],ymm6[6],ymm12[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm12[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm12 = xmm3[3,3],mem[3,3] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm12 = xmm12[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm5[0],ymm12[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0],ymm12[1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm9[2],ymm1[3],ymm9[3],ymm1[6],ymm9[6],ymm1[7],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm5 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] +; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm6 = ymm1[3,3],mem[3,3],ymm1[7,7],mem[7,7] ; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = mem[2,3,2,3,6,7,6,7] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0],ymm5[1,2],ymm9[3,4],ymm5[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm5 -; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6],ymm9[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3,4],ymm6[5,6],ymm9[7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0],ymm0[1,2,3,4],ymm6[5,6,7] +; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm6 +; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6],ymm9[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[4],ymm0[4],ymm2[5],ymm0[5] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm9[4,5],ymm5[6,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm9 = xmm8[3,3],mem[3,3] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm9 = xmm9[0,1,2],mem[3] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1,2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm9 # 32-byte Folded Reload -; AVX2-FAST-NEXT: # ymm9 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm9[1,2,3],ymm6[4,5,6,7] +; AVX2-FAST-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload +; AVX2-FAST-NEXT: # ymm9 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[3,3,3,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] @@ -10320,7 +10300,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2],ymm8[3,4],ymm9[5,6],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] +; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm8 = ymm7[0],ymm10[0],ymm7[1],ymm10[1],ymm7[4],ymm10[4],ymm7[5],ymm10[5] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = 
ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] @@ -10334,12 +10314,12 @@ ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovaps %ymm7, 1440(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm0, 1312(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm5, 1216(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm6, 1216(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm3, 1088(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm12, 992(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm4, 864(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm11, 768(%rax) -; AVX2-FAST-NEXT: vmovaps %ymm10, 640(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm5, 640(%rax) ; AVX2-FAST-NEXT: vmovaps %ymm15, 544(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 416(%rax) @@ -10353,8 +10333,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 1504(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1472(%rax) -; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vmovaps %ymm0, 1408(%rax) +; AVX2-FAST-NEXT: vmovaps %ymm14, 1408(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1344(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10435,7 +10414,7 @@ ; AVX2-FAST-NEXT: vmovaps %ymm0, 1568(%rax) ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm0, 1600(%rax) -; AVX2-FAST-NEXT: addq $3096, %rsp # imm = 0xC18 +; AVX2-FAST-NEXT: addq $3048, %rsp # imm = 0xBE8 ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -259,37 +259,41 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%r9), %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%r10), %ymm6, %ymm7 -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm6, %ymm9 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm6, %ymm6 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm6, %ymm6 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm9 = ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm9, %ymm9 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [21474836481,21474836481,21474836481,21474836481] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm10 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm9 = +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm9, %ymm9 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm10 = <1,5,u,u> ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm10 = [25769803778,25769803778,25769803778,25769803778] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm11 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm10, %ymm10 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm10 = +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm10, %ymm10 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm11, %ymm11 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm11 = ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm11, %ymm8 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm8[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm10[4,5,6,7] -; AVX2-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [30064771075,30064771075,30064771075,30064771075] -; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm7 -; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm4, %ymm4 -; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm7[6,7] +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = +; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4 +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vpermps %ymm5, %ymm7, %ymm5 +; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm5 = <3,7,u,u> ; AVX2-ONLY-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -963,21 +963,18 @@ ; AVX2-ONLY-NEXT: vmovdqa 32(%rdx), %ymm5 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20] ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10 ; AVX2-ONLY-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26] -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-ONLY-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX2-ONLY-NEXT: 
vpblendvb %ymm11, %ymm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm1[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpsrldq {{.*#+}} ymm11 = ymm4[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm4[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero -; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255] -; AVX2-ONLY-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm11, %ymm11 ; AVX2-ONLY-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm13, %ymm12 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -1585,8 +1585,7 @@ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,u,27,u,u,26,u,28,u,30,u,u,29,u,31,u] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm1[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] -; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,0,255,u,0,255,0,255,0,255,u,0,255,0,255,u,255,0,255,u,0,255,0,255,0,255,u,0,255,0,255,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30] @@ -3397,8 +3396,7 @@ ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm12, %ymm6 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm14[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0] -; AVX2-SLOW-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,255,u,0,255,0,255,0,255,u,0,255,0,255,u,255,0,255,u,0,255,0,255,0,255,u,0,255,0,255,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14] @@ -3409,8 +3407,7 @@ ; AVX2-SLOW-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255] -; AVX2-SLOW-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255,255,u,0,255,0,255,u,0,u,0,255,0,255,u,0,255> ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm9, %ymm4 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm13[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15] diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -559,8 +559,7 @@ ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] -; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,1,3,5,7] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> @@ -571,8 +570,7 @@ ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero ; AVX2-FAST-NEXT: vpor %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,2,4,u,0,2,4,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,0,4,8,u,u,u,u,1,5,9,u,u,u,u,18,22,26,u,u,u,u,19,23,27,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] @@ -1362,8 +1360,7 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [1,5,2,6,1,5,2,6] -; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,5,u,u,1,5,2,6> ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm13 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> @@ -1376,8 +1373,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm13 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21] @@ -2892,8 +2888,7 @@ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] -; 
AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] @@ -2993,8 +2988,7 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm11 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm12 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm14 @@ -5881,7 +5875,7 @@ ; ; AVX2-SLOW-LABEL: store_i8_stride7_vf64: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: subq $808, %rsp # imm = 0x328 +; AVX2-SLOW-NEXT: subq $744, %rsp # imm = 0x2E8 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5944,9 +5938,7 @@ ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0] -; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u,255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5955,9 +5947,7 @@ ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] @@ -6161,8 +6151,7 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] -; AVX2-SLOW-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm7, %ymm3 @@ -6224,13 +6213,13 @@ ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm13, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18,128] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm11, %ymm8 ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm8, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm10, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa %ymm10, %ymm15 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128] ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa %ymm12, %ymm14 ; AVX2-SLOW-NEXT: vpor %ymm9, %ymm11, %ymm9 @@ -6244,7 +6233,7 @@ ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vpor %ymm5, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,1,2,3,0,1,14,128,0,1,0,1,14,15,128,15,16,17,18,19,16,128,30,31,16,17,16,17,128,31,30,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] @@ -6270,13 +6259,13 @@ ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm15[11,u,u,u,u,14,u,12,u,u,u,u,15,u,13,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm8 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u,255,u,u,u,0,255,0,255,u,u,u,0,255,0,255,u> ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm5, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,14,u,12,u,u,u,u,15,u,13,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] ; AVX2-SLOW-NEXT: vpshufhw $190, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] -; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm9, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] @@ -6320,7 +6309,7 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovaps %ymm0, 384(%rax) ; AVX2-SLOW-NEXT: vmovdqa %ymm5, 416(%rax) -; AVX2-SLOW-NEXT: addq $808, %rsp # imm = 0x328 +; AVX2-SLOW-NEXT: addq $744, %rsp # imm = 0x2E8 ; AVX2-SLOW-NEXT: vzeroupper ; 
AVX2-SLOW-NEXT: retq ; @@ -6405,8 +6394,8 @@ ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm4 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm7 -; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm8 +; AVX2-FAST-NEXT: vmovdqa %xmm3, %xmm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] @@ -6421,9 +6410,9 @@ ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 -; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm7 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] +; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] @@ -6466,25 +6455,25 @@ ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm11 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm11, %xmm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = -; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm5 -; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm12 +; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm5 +; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm9, %xmm12 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = -; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm4 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm13 +; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm4 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm8, %xmm13 ; AVX2-FAST-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm12, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm7, %xmm10 +; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm10 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm10, %xmm0 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm3, %xmm10 -; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm9, %xmm11 +; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm11 ; AVX2-FAST-NEXT: vpor %xmm10, %xmm11, %xmm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] @@ -6503,8 +6492,8 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm11, %ymm15, %ymm11 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm10 -; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 
16-byte Reload -; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm9, %xmm12 +; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm7, %xmm12 ; AVX2-FAST-NEXT: vpor %xmm10, %xmm12, %xmm10 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -6531,7 +6520,7 @@ ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm8, %xmm8 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1] @@ -6539,20 +6528,19 @@ ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm14[8],mem[8],xmm14[9],mem[9],xmm14[10],mem[10],xmm14[11],mem[11],xmm14[12],mem[12],xmm14[13],mem[13],xmm14[14],mem[14],xmm14[15],mem[15] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [2,2,3,3,2,2,3,3] -; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,3,3,2,2,3,3] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm5, %ymm3 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,5,5,6] -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6689,13 +6677,13 @@ ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm0, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = 
[0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18,128] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm12 ; AVX2-FAST-NEXT: vpor %ymm10, %ymm12, %ymm10 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm1, %ymm7 ; AVX2-FAST-NEXT: vpor %ymm7, %ymm13, %ymm7 @@ -6712,7 +6700,7 @@ ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,1,14,128,0,1,0,1,14,15,128,15,16,17,18,19,16,128,30,31,16,17,16,17,128,31,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm15, %ymm10 @@ -7116,13 +7104,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm2, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,128,14,128,128,128,128,1,128,15,128,128,128,128,2,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128,18,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128,18,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm9, %ymm11, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm12, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = @@ -7137,7 +7125,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm2 ; 
AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,128,14,128,0,1,0,1,128,15,128,15,128,17,18,19,16,128,30,128,16,17,16,17,128,31,128,31] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [128,1,2,3,0,1,14,128,0,1,0,1,14,15,128,15,16,17,18,19,16,128,30,31,16,17,16,17,128,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm5, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [13,128,128,128,128,128,128,14,128,128,128,128,128,128,15,128,128,128,128,128,128,16,128,128,128,128,128,128,17,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm14, %ymm9 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -461,8 +461,7 @@ ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 @@ -470,8 +469,7 @@ ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u> ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] -; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 @@ -1588,268 +1586,269 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf32: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $72, %rsp +; AVX1-ONLY-NEXT: subq $56, %rsp ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [281474976710655,281474976710655,281474976710655,281474976710655] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm5, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm14 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm2, %ymm14 -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855] -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm2, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4],ymm5[5],ymm13[6],ymm5[7] -; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm1[0,1,2,3,4,4,6,5] -; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm12[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm7 = [NaN,NaN,NaN,NaN] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm14, %ymm14 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] +; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm13 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vmovaps %ymm9, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm12 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm9 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm5 -; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = 
xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm12, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), 
%xmm8 +; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm8[0],xmm13[0],xmm8[1],xmm13[1],xmm8[2],xmm13[2],xmm8[3],xmm13[3],xmm8[4],xmm13[4],xmm8[5],xmm13[5],xmm8[6],xmm13[6],xmm8[7],xmm13[7] ; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw 
{{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm10[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm14, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm14, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm6 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm10[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm4 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm13 = xmm14[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm13, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm15 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm15 = xmm8[8],mem[8],xmm8[9],mem[9],xmm8[10],mem[10],xmm8[11],mem[11],xmm8[12],mem[12],xmm8[13],mem[13],xmm8[14],mem[14],xmm8[15],mem[15] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; 
AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm13, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0],ymm6[1],ymm0[2],ymm6[3],ymm0[4],ymm6[5],ymm0[6],ymm6[7] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm9[8],xmm3[8],xmm9[9],xmm3[9],xmm9[10],xmm3[10],xmm9[11],xmm3[11],xmm9[12],xmm3[12],xmm9[13],xmm3[13],xmm9[14],xmm3[14],xmm9[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm3[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm13 = xmm15[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm6[8],xmm4[9],xmm6[9],xmm4[10],xmm6[10],xmm4[11],xmm6[11],xmm4[12],xmm6[12],xmm4[13],xmm6[13],xmm4[14],xmm6[14],xmm4[15],xmm6[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 -; 
AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm7, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3],ymm1[4],ymm5[5],ymm1[6],ymm5[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2],ymm8[3],ymm1[4],ymm8[5],ymm1[6],ymm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq 
{{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandps %ymm2, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 
= xmm12[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm9, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm7, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vandps %ymm7, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm7, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm4, 128(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm1, 192(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm3, 224(%rax) -; AVX1-ONLY-NEXT: vmovaps %ymm13, 160(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm3, 64(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm0, 192(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm1, 224(%rax) +; AVX1-ONLY-NEXT: vmovaps %ymm6, 160(%rax) ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax) -; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax) ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax) -; AVX1-ONLY-NEXT: addq $72, %rsp +; AVX1-ONLY-NEXT: addq $56, %rsp ; AVX1-ONLY-NEXT: vzeroupper ; AVX1-ONLY-NEXT: retq ; @@ -4146,306 +4145,305 @@ ; ; AVX1-ONLY-LABEL: store_i8_stride8_vf64: ; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: subq $360, %rsp # imm = 0x168 +; AVX1-ONLY-NEXT: subq $344, %rsp # imm = 0x158 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm0 -; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa (%r10), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [281474976710655,281474976710655,281474976710655,281474976710655] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa (%rax), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm8 = [1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vmovdqa (%r8), %xmm1 +; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; 
AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa (%rdx), %xmm2 +; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855] -; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2],ymm0[3],ymm7[4],ymm0[5],ymm7[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm9 = [NaN,NaN,NaN,NaN] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, 
%ymm4 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm6, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7] +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm6 -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm8, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm3 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 48(%rax), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = 
xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 +; AVX1-ONLY-NEXT: vmovdqa 48(%r8), %xmm5 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm8[8],xmm9[9],xmm8[9],xmm9[10],xmm8[10],xmm9[11],xmm8[11],xmm9[12],xmm8[12],xmm9[13],xmm8[13],xmm9[14],xmm8[14],xmm9[15],xmm8[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm8, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm6, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm6 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm14 +; AVX1-ONLY-NEXT: vmovdqa 48(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm15[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm14, %ymm12 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps 
%ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm12 = [1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm15[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm2 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = 
xmm2[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm7, %ymm4, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm7, %ymm8, %ymm7 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7] -; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 
= xmm2[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm9, %ymm5 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm9, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7] +; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm6 -; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX1-ONLY-NEXT: vmovdqa 32(%r10), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 32(%rax), %xmm7 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} 
xmm1 = xmm11[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa 32(%r9), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm9[8],xmm7[8],xmm9[9],xmm7[9],xmm9[10],xmm7[10],xmm9[11],xmm7[11],xmm9[12],xmm7[12],xmm9[13],xmm7[13],xmm9[14],xmm7[14],xmm9[15],xmm7[15] +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm6 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm8 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm14 -; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm15[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 32(%rcx), %xmm0 +; AVX1-ONLY-NEXT: vmovdqa 32(%rdx), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1 -; 
AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm13[1],ymm1[2],ymm13[3],ymm1[4],ymm13[5],ymm1[6],ymm13[7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm15, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm9, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm13, %ymm1 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2],ymm12[3],ymm1[4],ymm12[5],ymm1[6],ymm12[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm15[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm11 = [1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm11, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm11, %ymm4 +; AVX1-ONLY-NEXT: vmovaps %ymm11, %ymm12 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm14[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm14[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm11 = xmm11[0],zero,zero,zero,xmm11[1],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4 +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm10, %ymm10 +; AVX1-ONLY-NEXT: vorps %ymm4, %ymm10, %ymm4 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2],ymm1[3],ymm4[4],ymm1[5],ymm4[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = 
xmm6[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm9, %ymm7 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7] +; AVX1-ONLY-NEXT: vpshuflw 
{{.*#+}} xmm7 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] @@ -4456,190 +4454,189 @@ ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm9, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm12 -; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm7 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm12[8],xmm7[9],xmm12[9],xmm7[10],xmm12[10],xmm7[11],xmm12[11],xmm7[12],xmm12[12],xmm7[13],xmm12[13],xmm7[14],xmm12[14],xmm7[15],xmm12[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 +; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm8 +; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm4 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm13 -; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm2 -; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm9 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm8 -; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15] +; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm11 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm5, %ymm12 +; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm5 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm13 +; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 +; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm6 +; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm15 = xmm0[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm15 = xmm15[0],zero,xmm15[1],zero ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm14 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm4, %ymm14 -; AVX1-ONLY-NEXT: vorps %ymm2, %ymm14, %ymm2 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = 
ymm2[0],ymm13[1],ymm2[2],ymm13[3],ymm2[4],ymm13[5],ymm2[6],ymm13[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vandps %ymm9, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vandnps %ymm14, %ymm9, %ymm14 +; AVX1-ONLY-NEXT: vorps %ymm14, %ymm13, %ymm13 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4],ymm12[5],ymm13[6],ymm12[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 +; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm12 = [1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309,1.3906711615669959E-309] +; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm12, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm9, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 -; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3],xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vandps %ymm2, %ymm12, %ymm2 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm2 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 +; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 +; AVX1-ONLY-NEXT: vandps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm9, %ymm6 +; AVX1-ONLY-NEXT: vorps %ymm6, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4],ymm2[5],ymm3[6],ymm2[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm1, %ymm12, %ymm1 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3 +; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm3 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[0,0,2,1,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm4 = xmm4[8],mem[8],xmm4[9],mem[9],xmm4[10],mem[10],xmm4[11],mem[11],xmm4[12],mem[12],xmm4[13],mem[13],xmm4[14],mem[14],xmm4[15],mem[15] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[2,1,3,3,4,5,6,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm12, %ymm0 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm5 +; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm5 +; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] +; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload +; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload +; AVX1-ONLY-NEXT: # xmm7 = xmm7[8],mem[8],xmm7[9],mem[9],xmm7[10],mem[10],xmm7[11],mem[11],xmm7[12],mem[12],xmm7[13],mem[13],xmm7[14],mem[14],xmm7[15],mem[15] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm8 = xmm8[0],zero,xmm8[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm9 = xmm9[0],zero,xmm9[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: 
vandnps %ymm8, %ymm4, %ymm8 +; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,2,2,3,4,5,6,7] +; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm8, %ymm8 +; AVX1-ONLY-NEXT: vandps %ymm6, %ymm9, %ymm6 +; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm9, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0],ymm2[1],ymm6[2],ymm2[3],ymm6[4],ymm2[5],ymm6[6],ymm2[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm4[0,1,2,3,4,5,5,7] +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,7,7] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] +; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm12, %ymm3 +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vandps %ymm4, %ymm12, %ymm4 +; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 +; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm7[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm6 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = 
xmm6[0,1,1,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm3 = xmm3[8],mem[8],xmm3[9],mem[9],xmm3[10],mem[10],xmm3[11],mem[11],xmm3[12],mem[12],xmm3[13],mem[13],xmm3[14],mem[14],xmm3[15],mem[15] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,1,1] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm9 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 -; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: # xmm9 = xmm9[8],mem[8],xmm9[9],mem[9],xmm9[10],mem[10],xmm9[11],mem[11],xmm9[12],mem[12],xmm9[13],mem[13],xmm9[14],mem[14],xmm9[15],mem[15] -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,0,2,1,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm11 = xmm11[0],zero,xmm11[1],zero -; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,2,2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm12 = xmm12[0],zero,xmm12[1],zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm4, %ymm11 -; AVX1-ONLY-NEXT: vorps %ymm11, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm8, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm5 -; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,4,6,5] -; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,4,6,6,7] -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 +; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] -; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] -; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm4, %ymm5 -; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4],ymm0[5],ymm3[6],ymm0[7] 
+; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7]
+; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm9, %ymm4
+; AVX1-ONLY-NEXT: vandps %ymm0, %ymm9, %ymm0
+; AVX1-ONLY-NEXT: vorps %ymm4, %ymm0, %ymm0
+; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%rax)
-; AVX1-ONLY-NEXT: vmovaps %ymm7, 64(%rax)
+; AVX1-ONLY-NEXT: vmovaps %ymm5, 64(%rax)
; AVX1-ONLY-NEXT: vmovaps %ymm1, 160(%rax)
; AVX1-ONLY-NEXT: vmovaps %ymm2, 128(%rax)
; AVX1-ONLY-NEXT: vmovaps %ymm10, 224(%rax)
@@ -4654,7 +4651,7 @@
; AVX1-ONLY-NEXT: vmovaps %ymm0, 320(%rax)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 416(%rax)
-; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
+; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 384(%rax)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 480(%rax)
@@ -4664,7 +4661,7 @@
; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rax)
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX1-ONLY-NEXT: vmovaps %ymm0, 32(%rax)
-; AVX1-ONLY-NEXT: addq $360, %rsp # imm = 0x168
+; AVX1-ONLY-NEXT: addq $344, %rsp # imm = 0x158
; AVX1-ONLY-NEXT: vzeroupper
; AVX1-ONLY-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
--- a/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
+++ b/llvm/test/CodeGen/X86/vector-lzcnt-512.ll
@@ -56,7 +56,7 @@
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
@@ -133,7 +133,7 @@
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm1
; AVX512DQ-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
@@ -210,7 +210,7 @@
; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1
; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2
; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3
@@ -295,7 +295,7 @@ ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpsrld $16, %zmm0, %zmm1 ; AVX512DQ-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpandn %ymm1, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm3 @@ -335,7 +335,7 @@ ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512CD-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 @@ -382,7 +382,7 @@ ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, %xmm6 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7 @@ -419,7 +419,7 @@ ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512CD-NEXT: vpbroadcastw {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512CD-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 @@ -466,7 +466,7 @@ ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm4 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %xmm6, %xmm6, 
%xmm6 ; AVX512DQ-NEXT: vpcmpeqb %ymm6, %ymm4, %ymm7 @@ -508,7 +508,7 @@ ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CD-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm3, %zmm3 @@ -564,7 +564,7 @@ ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6 @@ -596,7 +596,7 @@ ; AVX512CD-NEXT: vplzcntd %zmm1, %zmm1 ; AVX512CD-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512CD-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm2 = [24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24] ; AVX512CD-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512CD-NEXT: vpmovzxbd {{.*#+}} zmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512CD-NEXT: vplzcntd %zmm3, %zmm3 @@ -652,7 +652,7 @@ ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm3 ; AVX512DQ-NEXT: vpsrlw $4, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX512DQ-NEXT: vpcmpeqb %ymm5, %ymm1, %ymm6 diff --git a/llvm/test/CodeGen/X86/vector-mul.ll b/llvm/test/CodeGen/X86/vector-mul.ll --- a/llvm/test/CodeGen/X86/vector-mul.ll +++ b/llvm/test/CodeGen/X86/vector-mul.ll @@ -353,8 +353,7 @@ ; ; X64-AVX2-LABEL: mul_v4i32_17: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [17,17,17,17] -; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: 
vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v4i32_17: @@ -507,8 +506,7 @@ ; ; X64-AVX2-LABEL: mul_v8i32_17: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [17,17,17,17,17,17,17,17] -; X64-AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v8i32_17: @@ -653,8 +651,7 @@ ; ; X64-AVX2-LABEL: mul_v4i32_neg33: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967263,4294967263,4294967263,4294967263] -; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v4i32_neg33: @@ -842,8 +839,7 @@ ; ; X64-AVX2-LABEL: mul_v8i32_neg33: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967263,4294967263,4294967263,4294967263,4294967263,4294967263,4294967263,4294967263] -; X64-AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 +; X64-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v8i32_neg33: @@ -1195,8 +1191,7 @@ ; ; X64-AVX2-LABEL: mul_v4i32_7: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v4i32_7: @@ -1321,8 +1316,7 @@ ; ; X64-AVX2-LABEL: mul_v4i32_neg63: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967233,4294967233,4294967233,4294967233] -; X64-AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-AVX2-NEXT: retq ; ; X64-AVX512DQ-LABEL: mul_v4i32_neg63: diff --git a/llvm/test/CodeGen/X86/vector-pack-128.ll b/llvm/test/CodeGen/X86/vector-pack-128.ll --- a/llvm/test/CodeGen/X86/vector-pack-128.ll +++ b/llvm/test/CodeGen/X86/vector-pack-128.ll @@ -26,8 +26,7 @@ ; AVX2-LABEL: trunc_concat_packssdw_128: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -69,8 +68,7 @@ ; AVX2-LABEL: trunc_concat_packusdw_128: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -158,8 +156,7 @@ ; AVX2-LABEL: concat_trunc_packssdw_128: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrad $17, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; @@ -204,8 +201,7 @@ ; AVX2-LABEL: concat_trunc_packusdw_128: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrld $17, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [15,15,15,15] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-pack-256.ll b/llvm/test/CodeGen/X86/vector-pack-256.ll --- a/llvm/test/CodeGen/X86/vector-pack-256.ll +++ 
b/llvm/test/CodeGen/X86/vector-pack-256.ll @@ -59,8 +59,7 @@ ; AVX2-LABEL: trunc_concat_packusdw_256: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrld $17, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128-ult-ugt.ll @@ -107,7 +107,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -210,7 +210,7 @@ ; ; AVX2-LABEL: ugt_2_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -347,7 +347,7 @@ ; ; AVX2-LABEL: ult_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -355,7 +355,7 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -364,7 +364,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -383,7 +383,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -486,7 +486,7 @@ ; ; AVX2-LABEL: ugt_3_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -623,7 +623,7 @@ ; ; AVX2-LABEL: ult_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -631,7 +631,7 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -640,7 +640,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -659,7 +659,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -762,7 +762,7 @@ ; ; AVX2-LABEL: ugt_4_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -899,7 +899,7 @@ ; ; AVX2-LABEL: ult_5_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -907,7 +907,7 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -916,7 +916,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -935,7 +935,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1038,7 +1038,7 @@ ; ; AVX2-LABEL: ugt_5_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1175,7 +1175,7 @@ ; ; AVX2-LABEL: ult_6_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1183,7 +1183,7 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1192,7 +1192,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1211,7 +1211,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; 
BITALG_NOVLX-NEXT: retq @@ -1314,7 +1314,7 @@ ; ; AVX2-LABEL: ugt_6_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1451,7 +1451,7 @@ ; ; AVX2-LABEL: ult_7_v16i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1459,7 +1459,7 @@ ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1468,7 +1468,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1487,7 +1487,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1600,7 +1600,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -1720,7 +1720,7 @@ ; ; AVX2-LABEL: ugt_2_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1879,7 +1879,7 @@ ; ; AVX2-LABEL: ult_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -1890,7 +1890,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -1899,7 +1899,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -1918,7 +1918,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2038,7 +2038,7 @@ ; ; AVX2-LABEL: ugt_3_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2197,7 +2197,7 @@ ; ; AVX2-LABEL: ult_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2208,7 +2208,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2217,7 +2217,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2236,7 +2236,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2356,7 +2356,7 @@ ; ; AVX2-LABEL: ugt_4_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2515,7 +2515,7 @@ ; ; AVX2-LABEL: ult_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2526,7 +2526,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2535,7 +2535,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2554,7 +2554,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2674,7 +2674,7 @@ ; ; AVX2-LABEL: ugt_5_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2833,7 +2833,7 @@ ; ; AVX2-LABEL: ult_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -2844,7 +2844,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -2853,7 +2853,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -2872,7 +2872,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def 
$xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -2992,7 +2992,7 @@ ; ; AVX2-LABEL: ugt_6_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3151,7 +3151,7 @@ ; ; AVX2-LABEL: ult_7_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3162,7 +3162,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3171,7 +3171,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3190,7 +3190,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3310,7 +3310,7 @@ ; ; AVX2-LABEL: ugt_7_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3469,7 +3469,7 @@ ; ; AVX2-LABEL: ult_8_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3480,7 +3480,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3489,7 +3489,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3508,7 +3508,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3628,7 +3628,7 @@ ; ; AVX2-LABEL: ugt_8_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3787,7 +3787,7 @@ ; ; AVX2-LABEL: ult_9_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -3798,7 +3798,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -3807,7 +3807,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -3826,7 +3826,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [9,9,9,9,9,9,9,9] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -3946,7 +3946,7 @@ ; ; AVX2-LABEL: ugt_9_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4105,7 +4105,7 @@ ; ; AVX2-LABEL: ult_10_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, 
%xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4116,7 +4116,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4125,7 +4125,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4144,7 +4144,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [10,10,10,10,10,10,10,10] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4264,7 +4264,7 @@ ; ; AVX2-LABEL: ugt_10_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4423,7 +4423,7 @@ ; ; AVX2-LABEL: ult_11_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4434,7 +4434,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4443,7 +4443,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4462,7 +4462,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [11,11,11,11,11,11,11,11] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4582,7 +4582,7 @@ ; ; AVX2-LABEL: ugt_11_v8i16: ; AVX2: # %bb.0: 
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4741,7 +4741,7 @@ ; ; AVX2-LABEL: ult_12_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -4752,7 +4752,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -4761,7 +4761,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -4780,7 +4780,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [12,12,12,12,12,12,12,12] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -4900,7 +4900,7 @@ ; ; AVX2-LABEL: ugt_12_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5059,7 +5059,7 @@ ; ; AVX2-LABEL: ult_13_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5070,7 +5070,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5079,7 +5079,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = 
[13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5098,7 +5098,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [13,13,13,13,13,13,13,13] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5218,7 +5218,7 @@ ; ; AVX2-LABEL: ugt_13_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5377,7 +5377,7 @@ ; ; AVX2-LABEL: ult_14_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5388,7 +5388,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5397,7 +5397,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5416,7 +5416,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [14,14,14,14,14,14,14,14] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -5536,7 +5536,7 @@ ; ; AVX2-LABEL: ugt_14_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5695,7 +5695,7 @@ ; ; AVX2-LABEL: ult_15_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -5706,7 +5706,7 @@ ; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vpsrlw 
$8, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -5715,7 +5715,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -5734,7 +5734,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15] ; BITALG_NOVLX-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -6011,7 +6011,7 @@ ; ; AVX2-LABEL: ugt_2_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6025,8 +6025,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,2,2,2] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v4i32: @@ -6200,7 +6199,7 @@ ; ; AVX2-LABEL: ult_3_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6385,7 +6384,7 @@ ; ; AVX2-LABEL: ugt_3_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6399,8 +6398,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [3,3,3,3] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v4i32: @@ -6574,7 +6572,7 @@ ; ; AVX2-LABEL: ult_4_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ 
-6759,7 +6757,7 @@ ; ; AVX2-LABEL: ugt_4_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -6773,8 +6771,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4,4,4,4] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v4i32: @@ -6948,7 +6945,7 @@ ; ; AVX2-LABEL: ult_5_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7133,7 +7130,7 @@ ; ; AVX2-LABEL: ugt_5_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7147,8 +7144,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v4i32: @@ -7322,7 +7318,7 @@ ; ; AVX2-LABEL: ult_6_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7507,7 +7503,7 @@ ; ; AVX2-LABEL: ugt_6_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7521,8 +7517,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v4i32: @@ -7696,7 +7691,7 @@ ; ; AVX2-LABEL: ult_7_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: 
vpshufb %xmm2, %xmm3, %xmm2 @@ -7881,7 +7876,7 @@ ; ; AVX2-LABEL: ugt_7_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -7895,8 +7890,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_7_v4i32: @@ -8070,7 +8064,7 @@ ; ; AVX2-LABEL: ult_8_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8255,7 +8249,7 @@ ; ; AVX2-LABEL: ugt_8_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8269,8 +8263,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,8,8,8] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_8_v4i32: @@ -8444,7 +8437,7 @@ ; ; AVX2-LABEL: ult_9_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8629,7 +8622,7 @@ ; ; AVX2-LABEL: ugt_9_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -8643,8 +8636,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [9,9,9,9] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_9_v4i32: @@ -8818,7 +8810,7 @@ ; ; AVX2-LABEL: ult_10_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9003,7 +8995,7 @@ ; ; AVX2-LABEL: ugt_10_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9017,8 +9009,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,10,10,10] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_10_v4i32: @@ -9192,7 +9183,7 @@ ; ; AVX2-LABEL: ult_11_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9377,7 +9368,7 @@ ; ; AVX2-LABEL: ugt_11_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9391,8 +9382,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [11,11,11,11] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_11_v4i32: @@ -9566,7 +9556,7 @@ ; ; AVX2-LABEL: ult_12_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9751,7 +9741,7 @@ ; ; AVX2-LABEL: ugt_12_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -9765,8 +9755,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [12,12,12,12] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_12_v4i32: @@ -9940,7 +9929,7 @@ ; ; AVX2-LABEL: ult_13_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, 
%xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10125,7 +10114,7 @@ ; ; AVX2-LABEL: ugt_13_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10139,8 +10128,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [13,13,13,13] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_13_v4i32: @@ -10314,7 +10302,7 @@ ; ; AVX2-LABEL: ult_14_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10499,7 +10487,7 @@ ; ; AVX2-LABEL: ugt_14_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10513,8 +10501,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [14,14,14,14] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_14_v4i32: @@ -10688,7 +10675,7 @@ ; ; AVX2-LABEL: ult_15_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10873,7 +10860,7 @@ ; ; AVX2-LABEL: ugt_15_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -10887,8 +10874,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_15_v4i32: @@ -11062,7 +11048,7 @@ ; ; AVX2-LABEL: ult_16_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11247,7 +11233,7 @@ ; ; AVX2-LABEL: ugt_16_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11261,8 +11247,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_16_v4i32: @@ -11436,7 +11421,7 @@ ; ; AVX2-LABEL: ult_17_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11621,7 +11606,7 @@ ; ; AVX2-LABEL: ugt_17_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11635,8 +11620,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [17,17,17,17] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_17_v4i32: @@ -11810,7 +11794,7 @@ ; ; AVX2-LABEL: ult_18_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -11995,7 +11979,7 @@ ; ; AVX2-LABEL: ugt_18_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12009,8 +11993,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [18,18,18,18] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_18_v4i32: @@ -12184,7 +12167,7 @@ ; ; AVX2-LABEL: ult_19_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12369,7 +12352,7 @@ ; ; AVX2-LABEL: ugt_19_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12383,8 +12366,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [19,19,19,19] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_19_v4i32: @@ -12558,7 +12540,7 @@ ; ; AVX2-LABEL: ult_20_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12743,7 +12725,7 @@ ; ; AVX2-LABEL: ugt_20_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -12757,8 +12739,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_20_v4i32: @@ -12932,7 +12913,7 @@ ; ; AVX2-LABEL: ult_21_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13117,7 +13098,7 @@ ; ; AVX2-LABEL: ugt_21_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13131,8 +13112,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [21,21,21,21] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_21_v4i32: @@ -13306,7 +13286,7 @@ ; ; AVX2-LABEL: 
ult_22_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13491,7 +13471,7 @@ ; ; AVX2-LABEL: ugt_22_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13505,8 +13485,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [22,22,22,22] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_22_v4i32: @@ -13680,7 +13659,7 @@ ; ; AVX2-LABEL: ult_23_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13865,7 +13844,7 @@ ; ; AVX2-LABEL: ugt_23_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -13879,8 +13858,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [23,23,23,23] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_23_v4i32: @@ -14054,7 +14032,7 @@ ; ; AVX2-LABEL: ult_24_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14239,7 +14217,7 @@ ; ; AVX2-LABEL: ugt_24_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14253,8 +14231,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [24,24,24,24] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; 
AVX512VPOPCNTDQ-LABEL: ugt_24_v4i32: @@ -14428,7 +14405,7 @@ ; ; AVX2-LABEL: ult_25_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14613,7 +14590,7 @@ ; ; AVX2-LABEL: ugt_25_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14627,8 +14604,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [25,25,25,25] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_25_v4i32: @@ -14802,7 +14778,7 @@ ; ; AVX2-LABEL: ult_26_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -14987,7 +14963,7 @@ ; ; AVX2-LABEL: ugt_26_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15001,8 +14977,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [26,26,26,26] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_26_v4i32: @@ -15176,7 +15151,7 @@ ; ; AVX2-LABEL: ult_27_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15361,7 +15336,7 @@ ; ; AVX2-LABEL: ugt_27_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15375,8 +15350,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [27,27,27,27] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_27_v4i32: @@ -15550,7 +15524,7 @@ ; ; AVX2-LABEL: ult_28_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15735,7 +15709,7 @@ ; ; AVX2-LABEL: ugt_28_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -15749,8 +15723,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [28,28,28,28] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_28_v4i32: @@ -15924,7 +15897,7 @@ ; ; AVX2-LABEL: ult_29_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16109,7 +16082,7 @@ ; ; AVX2-LABEL: ugt_29_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16123,8 +16096,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [29,29,29,29] -; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_29_v4i32: @@ -16298,7 +16270,7 @@ ; ; AVX2-LABEL: ult_30_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16483,7 +16455,7 @@ ; ; AVX2-LABEL: ugt_30_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16497,8 +16469,7 @@ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [30,30,30,30] -; 
AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_30_v4i32: @@ -16672,7 +16643,7 @@ ; ; AVX2-LABEL: ult_31_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -16917,7 +16888,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2,2] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17054,7 +17025,7 @@ ; ; AVX2-LABEL: ugt_2_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17213,7 +17184,7 @@ ; ; AVX2-LABEL: ult_3_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17223,7 +17194,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17231,7 +17202,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17249,7 +17220,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [3,3] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [3,3] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -17370,7 +17341,7 @@ ; ; AVX2-LABEL: ugt_3_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17529,7 +17500,7 @@ ; ; AVX2-LABEL: ult_4_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17539,7 +17510,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17547,7 +17518,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17565,7 +17536,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,4] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -17686,7 +17657,7 @@ ; ; AVX2-LABEL: ugt_4_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17845,7 +17816,7 @@ ; ; AVX2-LABEL: ult_5_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -17855,7 +17826,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -17863,7 +17834,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -17881,7 +17852,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [5,5] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [5,5] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18002,7 +17973,7 @@ ; ; AVX2-LABEL: ugt_5_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18161,7 +18132,7 @@ ; ; AVX2-LABEL: ult_6_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} 
xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18171,7 +18142,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18179,7 +18150,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18197,7 +18168,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,6] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [6,6] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18318,7 +18289,7 @@ ; ; AVX2-LABEL: ugt_6_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18477,7 +18448,7 @@ ; ; AVX2-LABEL: ult_7_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18487,7 +18458,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18495,7 +18466,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18513,7 +18484,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [7,7] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18634,7 +18605,7 @@ ; ; AVX2-LABEL: ugt_7_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18793,7 +18764,7 @@ ; ; AVX2-LABEL: ult_8_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -18803,7 +18774,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -18811,7 +18782,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -18829,7 +18800,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [8,8] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -18950,7 +18921,7 @@ ; ; AVX2-LABEL: ugt_8_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19109,7 +19080,7 @@ ; ; AVX2-LABEL: ult_9_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19119,7 +19090,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19127,7 +19098,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19145,7 +19116,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [9,9] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9,9] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19266,7 +19237,7 @@ ; ; AVX2-LABEL: ugt_9_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: 
vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19425,7 +19396,7 @@ ; ; AVX2-LABEL: ult_10_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19435,7 +19406,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19443,7 +19414,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19461,7 +19432,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [10,10] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,10] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19582,7 +19553,7 @@ ; ; AVX2-LABEL: ugt_10_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19741,7 +19712,7 @@ ; ; AVX2-LABEL: ult_11_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -19751,7 +19722,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -19759,7 +19730,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -19777,7 +19748,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [11,11] +; BITALG_NOVLX-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [11,11] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -19898,7 +19869,7 @@ ; ; AVX2-LABEL: ugt_11_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20057,7 +20028,7 @@ ; ; AVX2-LABEL: ult_12_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20067,7 +20038,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20075,7 +20046,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20093,7 +20064,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [12,12] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [12,12] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20214,7 +20185,7 @@ ; ; AVX2-LABEL: ugt_12_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20373,7 +20344,7 @@ ; ; AVX2-LABEL: ult_13_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20383,7 +20354,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20391,7 +20362,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ 
-20409,7 +20380,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [13,13] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [13,13] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20530,7 +20501,7 @@ ; ; AVX2-LABEL: ugt_13_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20689,7 +20660,7 @@ ; ; AVX2-LABEL: ult_14_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -20699,7 +20670,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -20707,7 +20678,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -20725,7 +20696,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [14,14] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [14,14] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -20846,7 +20817,7 @@ ; ; AVX2-LABEL: ugt_14_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21005,7 +20976,7 @@ ; ; AVX2-LABEL: ult_15_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21015,7 +20986,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21023,7 +20994,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; 
AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21041,7 +21012,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [15,15] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21162,7 +21133,7 @@ ; ; AVX2-LABEL: ugt_15_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21321,7 +21292,7 @@ ; ; AVX2-LABEL: ult_16_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21331,7 +21302,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21339,7 +21310,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21357,7 +21328,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [16,16] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21478,7 +21449,7 @@ ; ; AVX2-LABEL: ugt_16_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21637,7 +21608,7 @@ ; ; AVX2-LABEL: ult_17_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21647,7 +21618,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[17,17] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21655,7 +21626,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21673,7 +21644,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [17,17] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [17,17] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -21794,7 +21765,7 @@ ; ; AVX2-LABEL: ugt_17_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21953,7 +21924,7 @@ ; ; AVX2-LABEL: ult_18_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -21963,7 +21934,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -21971,7 +21942,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -21989,7 +21960,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [18,18] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18,18] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22110,7 +22081,7 @@ ; ; AVX2-LABEL: ugt_18_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22269,7 +22240,7 @@ ; ; AVX2-LABEL: ult_19_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ 
-22279,7 +22250,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22287,7 +22258,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22305,7 +22276,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [19,19] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [19,19] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22426,7 +22397,7 @@ ; ; AVX2-LABEL: ugt_19_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22585,7 +22556,7 @@ ; ; AVX2-LABEL: ult_20_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22595,7 +22566,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22603,7 +22574,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22621,7 +22592,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [20,20] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [20,20] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -22742,7 +22713,7 @@ ; ; AVX2-LABEL: ugt_20_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22901,7 +22872,7 @@ ; ; AVX2-LABEL: ult_21_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: 
vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -22911,7 +22882,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -22919,7 +22890,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -22937,7 +22908,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [21,21] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [21,21] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23058,7 +23029,7 @@ ; ; AVX2-LABEL: ugt_21_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23217,7 +23188,7 @@ ; ; AVX2-LABEL: ult_22_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23227,7 +23198,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23235,7 +23206,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23253,7 +23224,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [22,22] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [22,22] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23374,7 +23345,7 @@ ; ; AVX2-LABEL: ugt_22_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23533,7 +23504,7 @@ ; ; AVX2-LABEL: ult_23_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23543,7 +23514,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23551,7 +23522,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23569,7 +23540,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [23,23] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [23,23] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -23690,7 +23661,7 @@ ; ; AVX2-LABEL: ugt_23_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23849,7 +23820,7 @@ ; ; AVX2-LABEL: ult_24_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -23859,7 +23830,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -23867,7 +23838,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -23885,7 +23856,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [24,24] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [24,24] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24006,7 +23977,7 @@ ; ; AVX2-LABEL: ugt_24_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24165,7 +24136,7 @@ ; ; AVX2-LABEL: ult_25_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24175,7 +24146,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24183,7 +24154,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24201,7 +24172,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [25,25] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [25,25] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24322,7 +24293,7 @@ ; ; AVX2-LABEL: ugt_25_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24481,7 +24452,7 @@ ; ; AVX2-LABEL: ult_26_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24491,7 +24462,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24499,7 +24470,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [26,26] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24517,7 +24488,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [26,26] +; BITALG_NOVLX-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [26,26] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24638,7 +24609,7 @@ ; ; AVX2-LABEL: ugt_26_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24797,7 +24768,7 @@ ; ; AVX2-LABEL: ult_27_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -24807,7 +24778,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -24815,7 +24786,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -24833,7 +24804,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [27,27] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [27,27] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -24954,7 +24925,7 @@ ; ; AVX2-LABEL: ugt_27_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25113,7 +25084,7 @@ ; ; AVX2-LABEL: ult_28_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25123,7 +25094,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25131,7 +25102,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ 
-25149,7 +25120,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [28,28] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [28,28] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25270,7 +25241,7 @@ ; ; AVX2-LABEL: ugt_28_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25429,7 +25400,7 @@ ; ; AVX2-LABEL: ult_29_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25439,7 +25410,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25447,7 +25418,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25465,7 +25436,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [29,29] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [29,29] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25586,7 +25557,7 @@ ; ; AVX2-LABEL: ugt_29_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25745,7 +25716,7 @@ ; ; AVX2-LABEL: ult_30_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -25755,7 +25726,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -25763,7 +25734,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; 
AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -25781,7 +25752,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [30,30] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [30,30] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -25902,7 +25873,7 @@ ; ; AVX2-LABEL: ugt_30_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26061,7 +26032,7 @@ ; ; AVX2-LABEL: ult_31_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26071,7 +26042,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26079,7 +26050,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26097,7 +26068,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [31,31] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [31,31] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26218,7 +26189,7 @@ ; ; AVX2-LABEL: ugt_31_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26377,7 +26348,7 @@ ; ; AVX2-LABEL: ult_32_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26387,7 +26358,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[32,32] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26395,7 +26366,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26413,7 +26384,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [32,32] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32,32] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26534,7 +26505,7 @@ ; ; AVX2-LABEL: ugt_32_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26693,7 +26664,7 @@ ; ; AVX2-LABEL: ult_33_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -26703,7 +26674,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -26711,7 +26682,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -26729,7 +26700,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [33,33] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [33,33] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -26850,7 +26821,7 @@ ; ; AVX2-LABEL: ugt_33_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27009,7 +26980,7 @@ ; ; AVX2-LABEL: ult_34_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ 
-27019,7 +26990,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27027,7 +26998,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27045,7 +27016,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [34,34] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [34,34] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27166,7 +27137,7 @@ ; ; AVX2-LABEL: ugt_34_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27325,7 +27296,7 @@ ; ; AVX2-LABEL: ult_35_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27335,7 +27306,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27343,7 +27314,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27361,7 +27332,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [35,35] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [35,35] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27482,7 +27453,7 @@ ; ; AVX2-LABEL: ugt_35_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27641,7 +27612,7 @@ ; ; AVX2-LABEL: ult_36_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: 
vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27651,7 +27622,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27659,7 +27630,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27677,7 +27648,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [36,36] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [36,36] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -27798,7 +27769,7 @@ ; ; AVX2-LABEL: ugt_36_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27957,7 +27928,7 @@ ; ; AVX2-LABEL: ult_37_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -27967,7 +27938,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -27975,7 +27946,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -27993,7 +27964,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [37,37] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [37,37] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28114,7 +28085,7 @@ ; ; AVX2-LABEL: ugt_37_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28273,7 +28244,7 @@ ; ; AVX2-LABEL: ult_38_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28283,7 +28254,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28291,7 +28262,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28309,7 +28280,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [38,38] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [38,38] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28430,7 +28401,7 @@ ; ; AVX2-LABEL: ugt_38_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28589,7 +28560,7 @@ ; ; AVX2-LABEL: ult_39_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28599,7 +28570,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28607,7 +28578,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28625,7 +28596,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [39,39] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [39,39] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -28746,7 +28717,7 @@ ; ; AVX2-LABEL: ugt_39_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28905,7 +28876,7 @@ ; ; AVX2-LABEL: ult_40_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -28915,7 +28886,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -28923,7 +28894,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -28941,7 +28912,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [40,40] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [40,40] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29062,7 +29033,7 @@ ; ; AVX2-LABEL: ugt_40_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29221,7 +29192,7 @@ ; ; AVX2-LABEL: ult_41_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29231,7 +29202,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29239,7 +29210,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [41,41] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29257,7 +29228,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [41,41] +; BITALG_NOVLX-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [41,41] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29378,7 +29349,7 @@ ; ; AVX2-LABEL: ugt_41_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29537,7 +29508,7 @@ ; ; AVX2-LABEL: ult_42_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29547,7 +29518,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29555,7 +29526,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -29573,7 +29544,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [42,42] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [42,42] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -29694,7 +29665,7 @@ ; ; AVX2-LABEL: ugt_42_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29853,7 +29824,7 @@ ; ; AVX2-LABEL: ult_43_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -29863,7 +29834,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -29871,7 +29842,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ 
-29889,7 +29860,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [43,43] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [43,43] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30010,7 +29981,7 @@ ; ; AVX2-LABEL: ugt_43_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30169,7 +30140,7 @@ ; ; AVX2-LABEL: ult_44_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30179,7 +30150,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30187,7 +30158,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30205,7 +30176,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [44,44] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [44,44] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30326,7 +30297,7 @@ ; ; AVX2-LABEL: ugt_44_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30485,7 +30456,7 @@ ; ; AVX2-LABEL: ult_45_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30495,7 +30466,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30503,7 +30474,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; 
AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30521,7 +30492,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [45,45] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [45,45] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30642,7 +30613,7 @@ ; ; AVX2-LABEL: ugt_45_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30801,7 +30772,7 @@ ; ; AVX2-LABEL: ult_46_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -30811,7 +30782,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -30819,7 +30790,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -30837,7 +30808,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [46,46] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [46,46] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -30958,7 +30929,7 @@ ; ; AVX2-LABEL: ugt_46_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31117,7 +31088,7 @@ ; ; AVX2-LABEL: ult_47_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31127,7 +31098,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[47,47] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31135,7 +31106,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31153,7 +31124,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [47,47] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [47,47] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31274,7 +31245,7 @@ ; ; AVX2-LABEL: ugt_47_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31433,7 +31404,7 @@ ; ; AVX2-LABEL: ult_48_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31443,7 +31414,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31451,7 +31422,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31469,7 +31440,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [48,48] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [48,48] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31590,7 +31561,7 @@ ; ; AVX2-LABEL: ugt_48_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -31749,7 +31720,7 @@ ; ; AVX2-LABEL: ult_49_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ 
-31759,7 +31730,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -31767,7 +31738,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -31785,7 +31756,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [49,49] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [49,49] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -31906,7 +31877,7 @@ ; ; AVX2-LABEL: ugt_49_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32065,7 +32036,7 @@ ; ; AVX2-LABEL: ult_50_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32075,7 +32046,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32083,7 +32054,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32101,7 +32072,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [50,50] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [50,50] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32222,7 +32193,7 @@ ; ; AVX2-LABEL: ugt_50_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32381,7 +32352,7 @@ ; ; AVX2-LABEL: ult_51_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: 
vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32391,7 +32362,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32399,7 +32370,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32417,7 +32388,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [51,51] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [51,51] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32538,7 +32509,7 @@ ; ; AVX2-LABEL: ugt_51_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32697,7 +32668,7 @@ ; ; AVX2-LABEL: ult_52_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -32707,7 +32678,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -32715,7 +32686,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -32733,7 +32704,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [52,52] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [52,52] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -32854,7 +32825,7 @@ ; ; AVX2-LABEL: ugt_52_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33013,7 +32984,7 @@ ; ; AVX2-LABEL: ult_53_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33023,7 +32994,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33031,7 +33002,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33049,7 +33020,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [53,53] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [53,53] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33170,7 +33141,7 @@ ; ; AVX2-LABEL: ugt_53_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33329,7 +33300,7 @@ ; ; AVX2-LABEL: ult_54_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33339,7 +33310,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33347,7 +33318,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33365,7 +33336,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [54,54] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [54,54] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33486,7 +33457,7 @@ ; ; AVX2-LABEL: ugt_54_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa 
{{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33645,7 +33616,7 @@ ; ; AVX2-LABEL: ult_55_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33655,7 +33626,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33663,7 +33634,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33681,7 +33652,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [55,55] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [55,55] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -33802,7 +33773,7 @@ ; ; AVX2-LABEL: ugt_55_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33961,7 +33932,7 @@ ; ; AVX2-LABEL: ult_56_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -33971,7 +33942,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -33979,7 +33950,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [56,56] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -33997,7 +33968,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [56,56] +; BITALG_NOVLX-NEXT: 
vpbroadcastq {{.*#+}} xmm1 = [56,56] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34118,7 +34089,7 @@ ; ; AVX2-LABEL: ugt_56_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34277,7 +34248,7 @@ ; ; AVX2-LABEL: ult_57_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34287,7 +34258,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34295,7 +34266,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34313,7 +34284,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [57,57] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [57,57] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34434,7 +34405,7 @@ ; ; AVX2-LABEL: ugt_57_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34593,7 +34564,7 @@ ; ; AVX2-LABEL: ult_58_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34603,7 +34574,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34611,7 +34582,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ 
-34629,7 +34600,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [58,58] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [58,58] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -34750,7 +34721,7 @@ ; ; AVX2-LABEL: ugt_58_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34909,7 +34880,7 @@ ; ; AVX2-LABEL: ult_59_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -34919,7 +34890,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -34927,7 +34898,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -34945,7 +34916,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [59,59] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [59,59] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35066,7 +35037,7 @@ ; ; AVX2-LABEL: ugt_59_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35225,7 +35196,7 @@ ; ; AVX2-LABEL: ult_60_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35235,7 +35206,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35243,7 +35214,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; 
AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35261,7 +35232,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [60,60] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [60,60] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35382,7 +35353,7 @@ ; ; AVX2-LABEL: ugt_60_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35541,7 +35512,7 @@ ; ; AVX2-LABEL: ult_61_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35551,7 +35522,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35559,7 +35530,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35577,7 +35548,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [61,61] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [61,61] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -35698,7 +35669,7 @@ ; ; AVX2-LABEL: ugt_61_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35857,7 +35828,7 @@ ; ; AVX2-LABEL: ult_62_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -35867,7 +35838,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[62,62] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -35875,7 +35846,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -35893,7 +35864,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [62,62] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [62,62] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq @@ -36014,7 +35985,7 @@ ; ; AVX2-LABEL: ugt_62_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36173,7 +36144,7 @@ ; ; AVX2-LABEL: ult_63_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -36183,7 +36154,7 @@ ; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -36191,7 +36162,7 @@ ; AVX512VPOPCNTDQ: # %bb.0: ; AVX512VPOPCNTDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; AVX512VPOPCNTDQ-NEXT: vzeroupper ; AVX512VPOPCNTDQ-NEXT: retq @@ -36209,7 +36180,7 @@ ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 ; BITALG_NOVLX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; BITALG_NOVLX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} xmm1 = [63,63] +; BITALG_NOVLX-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63] ; BITALG_NOVLX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 ; BITALG_NOVLX-NEXT: vzeroupper ; BITALG_NOVLX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-popcnt-128.ll b/llvm/test/CodeGen/X86/vector-popcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-128.ll @@ -84,20 +84,6 @@ ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: testv2i64: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: 
vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; ; XOP-LABEL: testv2i64: ; XOP: # %bb.0: ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 @@ -235,24 +221,6 @@ ; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: testv4i32: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1OR2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; ; XOP-LABEL: testv4i32: ; XOP: # %bb.0: ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 @@ -390,21 +358,6 @@ ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: testv8i16: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX1OR2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; ; XOP-LABEL: testv8i16: ; XOP: # %bb.0: ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 @@ -518,18 +471,6 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: testv16i8: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1OR2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX1OR2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; ; XOP-LABEL: testv16i8: ; XOP: # %bb.0: ; XOP-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll @@ -119,7 +119,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -161,7 +161,7 @@ ; ; AVX2-LABEL: ugt_2_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: 
vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -174,7 +174,7 @@ ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -242,7 +242,7 @@ ; ; AVX2-LABEL: ult_3_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -250,13 +250,13 @@ ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_3_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -264,7 +264,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -286,7 +286,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -328,7 +328,7 @@ ; ; AVX2-LABEL: ugt_3_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -341,7 +341,7 @@ ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -409,7 +409,7 @@ ; ; AVX2-LABEL: ult_4_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -417,13 +417,13 @@ ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_4_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -431,7 +431,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -453,7 +453,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -495,7 +495,7 @@ ; ; AVX2-LABEL: ugt_4_v32i8: ; AVX2: 
# %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -508,7 +508,7 @@ ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -576,7 +576,7 @@ ; ; AVX2-LABEL: ult_5_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -584,13 +584,13 @@ ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_5_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -598,7 +598,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -620,7 +620,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -662,7 
+662,7 @@ ; ; AVX2-LABEL: ugt_5_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -675,7 +675,7 @@ ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -743,7 +743,7 @@ ; ; AVX2-LABEL: ult_6_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -751,13 +751,13 @@ ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_6_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -765,7 +765,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -787,7 +787,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, 
%ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -829,7 +829,7 @@ ; ; AVX2-LABEL: ugt_6_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -842,7 +842,7 @@ ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -910,7 +910,7 @@ ; ; AVX2-LABEL: ult_7_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -918,13 +918,13 @@ ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ult_7_v32i8: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -932,7 +932,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -954,7 +954,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntb %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -1082,7 +1082,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -1130,7 +1130,7 @@ ; ; AVX2-LABEL: ugt_2_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1210,7 +1210,7 @@ ; ; AVX2-LABEL: ult_3_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1221,7 +1221,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1230,7 +1230,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1247,7 +1247,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -1295,7 +1295,7 @@ ; ; AVX2-LABEL: ugt_3_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1375,7 +1375,7 @@ ; ; AVX2-LABEL: ult_4_v16i16: ; 
AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1386,7 +1386,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1395,7 +1395,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1412,7 +1412,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -1460,7 +1460,7 @@ ; ; AVX2-LABEL: ugt_4_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1540,7 +1540,7 @@ ; ; AVX2-LABEL: ult_5_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1551,7 +1551,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1560,7 +1560,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; 
AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1577,7 +1577,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -1625,7 +1625,7 @@ ; ; AVX2-LABEL: ugt_5_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1705,7 +1705,7 @@ ; ; AVX2-LABEL: ult_6_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1716,7 +1716,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1725,7 +1725,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1742,7 +1742,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -1790,7 +1790,7 @@ ; ; AVX2-LABEL: ugt_6_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, 
%ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1870,7 +1870,7 @@ ; ; AVX2-LABEL: ult_7_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1881,7 +1881,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -1890,7 +1890,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -1907,7 +1907,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -1955,7 +1955,7 @@ ; ; AVX2-LABEL: ugt_7_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2035,7 +2035,7 @@ ; ; AVX2-LABEL: ult_8_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2046,7 +2046,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2055,7 +2055,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2072,7 +2072,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -2120,7 +2120,7 @@ ; ; AVX2-LABEL: ugt_8_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2200,7 +2200,7 @@ ; ; AVX2-LABEL: ult_9_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2211,7 +2211,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2220,7 +2220,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2237,7 +2237,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -2285,7 +2285,7 @@ ; ; AVX2-LABEL: ugt_9_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2365,7 +2365,7 @@ ; ; AVX2-LABEL: ult_10_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2376,7 +2376,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2385,7 +2385,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2402,7 +2402,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -2450,7 +2450,7 @@ ; ; AVX2-LABEL: ugt_10_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2530,7 +2530,7 @@ ; ; AVX2-LABEL: ult_11_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2541,7 +2541,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: 
vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2550,7 +2550,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2567,7 +2567,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -2615,7 +2615,7 @@ ; ; AVX2-LABEL: ugt_11_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2695,7 +2695,7 @@ ; ; AVX2-LABEL: ult_12_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2706,7 +2706,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2715,7 +2715,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2732,7 +2732,7 @@ ; BITALG_NOVLX: # %bb.0: ; 
BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -2780,7 +2780,7 @@ ; ; AVX2-LABEL: ugt_12_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2860,7 +2860,7 @@ ; ; AVX2-LABEL: ult_13_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2871,7 +2871,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2880,7 +2880,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -2897,7 +2897,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -2945,7 +2945,7 @@ ; ; AVX2-LABEL: ugt_13_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3025,7 +3025,7 @@ ; ; AVX2-LABEL: ult_14_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3036,7 +3036,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3045,7 +3045,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -3062,7 +3062,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -3110,7 +3110,7 @@ ; ; AVX2-LABEL: ugt_14_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3190,7 +3190,7 @@ ; ; AVX2-LABEL: ult_15_v16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3201,7 +3201,7 @@ ; AVX2-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -3210,7 +3210,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NEXT: retq ; @@ -3227,7 +3227,7 @@ ; BITALG_NOVLX: # %bb.0: ; BITALG_NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; BITALG_NOVLX-NEXT: vpopcntw %zmm0, %zmm0 -; BITALG_NOVLX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; BITALG_NOVLX-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; BITALG_NOVLX-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; BITALG_NOVLX-NEXT: retq ; @@ -3410,7 +3410,7 @@ ; ; AVX2-LABEL: ugt_2_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3424,8 +3424,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v8i32: @@ -3512,7 +3511,7 @@ ; ; AVX2-LABEL: ult_3_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3614,7 +3613,7 @@ ; ; AVX2-LABEL: ugt_3_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3628,8 +3627,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v8i32: @@ -3716,7 +3714,7 @@ ; ; AVX2-LABEL: ult_4_v8i32: ; AVX2: # %bb.0: -; 
AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3818,7 +3816,7 @@ ; ; AVX2-LABEL: ugt_4_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3832,8 +3830,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v8i32: @@ -3920,7 +3917,7 @@ ; ; AVX2-LABEL: ult_5_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4022,7 +4019,7 @@ ; ; AVX2-LABEL: ugt_5_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4036,8 +4033,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v8i32: @@ -4124,7 +4120,7 @@ ; ; AVX2-LABEL: ult_6_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4226,7 +4222,7 @@ ; ; AVX2-LABEL: ugt_6_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4240,8 +4236,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v8i32: @@ -4328,7 +4323,7 @@ ; ; AVX2-LABEL: ult_7_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4430,7 +4425,7 @@ ; ; AVX2-LABEL: ugt_7_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4444,8 +4439,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_7_v8i32: @@ -4532,7 +4526,7 @@ ; ; AVX2-LABEL: ult_8_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4634,7 +4628,7 @@ ; ; AVX2-LABEL: ugt_8_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4648,8 +4642,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: 
vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_8_v8i32: @@ -4736,7 +4729,7 @@ ; ; AVX2-LABEL: ult_9_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4838,7 +4831,7 @@ ; ; AVX2-LABEL: ugt_9_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -4852,8 +4845,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_9_v8i32: @@ -4940,7 +4932,7 @@ ; ; AVX2-LABEL: ult_10_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5042,7 +5034,7 @@ ; ; AVX2-LABEL: ugt_10_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5056,8 +5048,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_10_v8i32: @@ -5144,7 +5135,7 @@ ; ; AVX2-LABEL: ult_11_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5246,7 +5237,7 @@ ; ; AVX2-LABEL: ugt_11_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5260,8 +5251,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_11_v8i32: @@ -5348,7 +5338,7 @@ ; ; AVX2-LABEL: ult_12_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5450,7 +5440,7 @@ ; ; AVX2-LABEL: ugt_12_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5464,8 +5454,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_12_v8i32: @@ -5552,7 +5541,7 @@ ; ; AVX2-LABEL: ult_13_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5654,7 +5643,7 @@ ; ; AVX2-LABEL: ugt_13_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5668,8 +5657,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_13_v8i32: @@ -5756,7 +5744,7 @@ ; ; AVX2-LABEL: ult_14_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5858,7 +5846,7 @@ ; ; AVX2-LABEL: ugt_14_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -5872,8 +5860,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_14_v8i32: @@ -5960,7 +5947,7 @@ ; ; AVX2-LABEL: ult_15_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6062,7 +6049,7 @@ ; ; AVX2-LABEL: ugt_15_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6076,8 +6063,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_15_v8i32: @@ -6164,7 +6150,7 @@ ; ; AVX2-LABEL: ult_16_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6266,7 +6252,7 @@ ; ; AVX2-LABEL: ugt_16_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6280,8 +6266,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_16_v8i32: @@ -6368,7 +6353,7 @@ ; ; AVX2-LABEL: ult_17_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6470,7 +6455,7 @@ ; ; AVX2-LABEL: ugt_17_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6484,8 +6469,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [17,17,17,17,17,17,17,17] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_17_v8i32: @@ -6572,7 +6556,7 @@ ; ; AVX2-LABEL: ult_18_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6674,7 +6658,7 @@ ; ; AVX2-LABEL: ugt_18_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6688,8 +6672,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [18,18,18,18,18,18,18,18] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_18_v8i32: @@ -6776,7 +6759,7 @@ ; ; AVX2-LABEL: ult_19_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6878,7 +6861,7 @@ ; ; AVX2-LABEL: ugt_19_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -6892,8 +6875,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [19,19,19,19,19,19,19,19] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_19_v8i32: @@ -6980,7 +6962,7 @@ ; ; AVX2-LABEL: ult_20_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7082,7 +7064,7 @@ ; ; AVX2-LABEL: ugt_20_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7096,8 +7078,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [20,20,20,20,20,20,20,20] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_20_v8i32: @@ -7184,7 +7165,7 @@ ; ; AVX2-LABEL: ult_21_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7286,7 +7267,7 @@ ; ; AVX2-LABEL: ugt_21_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7300,8 +7281,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [21,21,21,21,21,21,21,21] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_21_v8i32: @@ -7388,7 +7368,7 @@ ; ; AVX2-LABEL: ult_22_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7490,7 +7470,7 @@ ; ; AVX2-LABEL: ugt_22_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7504,8 +7484,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[22,22,22,22,22,22,22,22] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_22_v8i32: @@ -7592,7 +7571,7 @@ ; ; AVX2-LABEL: ult_23_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7694,7 +7673,7 @@ ; ; AVX2-LABEL: ugt_23_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7708,8 +7687,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [23,23,23,23,23,23,23,23] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_23_v8i32: @@ -7796,7 +7774,7 @@ ; ; AVX2-LABEL: ult_24_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7898,7 +7876,7 @@ ; ; AVX2-LABEL: ugt_24_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -7912,8 +7890,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [24,24,24,24,24,24,24,24] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_24_v8i32: @@ -8000,7 +7977,7 @@ ; ; AVX2-LABEL: ult_25_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8102,7 +8079,7 @@ ; ; AVX2-LABEL: ugt_25_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8116,8 +8093,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [25,25,25,25,25,25,25,25] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_25_v8i32: @@ -8204,7 +8180,7 @@ ; ; AVX2-LABEL: ult_26_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8306,7 +8282,7 @@ ; ; AVX2-LABEL: ugt_26_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8320,8 +8296,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [26,26,26,26,26,26,26,26] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_26_v8i32: @@ -8408,7 +8383,7 @@ ; ; AVX2-LABEL: ult_27_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8510,7 +8485,7 @@ ; ; AVX2-LABEL: ugt_27_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8524,8 +8499,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [27,27,27,27,27,27,27,27] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_27_v8i32: @@ -8612,7 +8586,7 @@ ; ; AVX2-LABEL: ult_28_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8714,7 +8688,7 @@ ; ; AVX2-LABEL: ugt_28_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8728,8 +8702,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [28,28,28,28,28,28,28,28] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_28_v8i32: @@ -8816,7 +8789,7 @@ ; ; AVX2-LABEL: ult_29_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8918,7 +8891,7 @@ ; ; AVX2-LABEL: ugt_29_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -8932,8 +8905,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[29,29,29,29,29,29,29,29] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_29_v8i32: @@ -9020,7 +8992,7 @@ ; ; AVX2-LABEL: ult_30_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9122,7 +9094,7 @@ ; ; AVX2-LABEL: ugt_30_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9136,8 +9108,7 @@ ; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [30,30,30,30,30,30,30,30] -; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_30_v8i32: @@ -9224,7 +9195,7 @@ ; ; AVX2-LABEL: ult_31_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9448,7 +9419,7 @@ ; ; AVX2-LABEL: ugt_2_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9458,8 +9429,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2,2,2,2] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_2_v4i64: @@ -9530,7 +9500,7 @@ ; ; AVX2-LABEL: ult_3_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9612,7 +9582,7 @@ ; ; AVX2-LABEL: ugt_3_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9622,8 +9592,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [3,3,3,3] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_3_v4i64: @@ -9694,7 +9663,7 @@ ; ; AVX2-LABEL: ult_4_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9776,7 +9745,7 @@ ; ; AVX2-LABEL: ugt_4_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9786,8 +9755,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4,4,4,4] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_4_v4i64: @@ -9858,7 +9826,7 @@ ; ; AVX2-LABEL: ult_5_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9940,7 +9908,7 @@ ; ; AVX2-LABEL: ugt_5_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -9950,8 +9918,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: 
vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [5,5,5,5] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_5_v4i64: @@ -10022,7 +9989,7 @@ ; ; AVX2-LABEL: ult_6_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10104,7 +10071,7 @@ ; ; AVX2-LABEL: ugt_6_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10114,8 +10081,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6,6,6,6] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_6_v4i64: @@ -10186,7 +10152,7 @@ ; ; AVX2-LABEL: ult_7_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10268,7 +10234,7 @@ ; ; AVX2-LABEL: ugt_7_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10278,8 +10244,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [7,7,7,7] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_7_v4i64: @@ -10350,7 +10315,7 @@ ; ; AVX2-LABEL: ult_8_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10432,7 +10397,7 @@ ; ; AVX2-LABEL: ugt_8_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10442,8 +10407,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [8,8,8,8] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_8_v4i64: @@ -10514,7 +10478,7 @@ ; ; AVX2-LABEL: ult_9_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10596,7 +10560,7 @@ ; ; AVX2-LABEL: ugt_9_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10606,8 +10570,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9,9,9,9] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_9_v4i64: @@ -10678,7 +10641,7 @@ ; ; AVX2-LABEL: ult_10_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10760,7 +10723,7 @@ ; ; AVX2-LABEL: ugt_10_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10770,8 +10733,7 @@ ; AVX2-NEXT: vpaddb 
%ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [10,10,10,10] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_10_v4i64: @@ -10842,7 +10804,7 @@ ; ; AVX2-LABEL: ult_11_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10924,7 +10886,7 @@ ; ; AVX2-LABEL: ugt_11_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -10934,8 +10896,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [11,11,11,11] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_11_v4i64: @@ -11006,7 +10967,7 @@ ; ; AVX2-LABEL: ult_12_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11088,7 +11049,7 @@ ; ; AVX2-LABEL: ugt_12_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11098,8 +11059,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [12,12,12,12] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_12_v4i64: @@ -11170,7 +11130,7 @@ ; ; AVX2-LABEL: ult_13_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: 
vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11252,7 +11212,7 @@ ; ; AVX2-LABEL: ugt_13_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11262,8 +11222,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [13,13,13,13] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_13_v4i64: @@ -11334,7 +11293,7 @@ ; ; AVX2-LABEL: ult_14_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11416,7 +11375,7 @@ ; ; AVX2-LABEL: ugt_14_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11426,8 +11385,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [14,14,14,14] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_14_v4i64: @@ -11498,7 +11456,7 @@ ; ; AVX2-LABEL: ult_15_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11580,7 +11538,7 @@ ; ; AVX2-LABEL: ugt_15_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, 
%ymm3, %ymm2 @@ -11590,8 +11548,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [15,15,15,15] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_15_v4i64: @@ -11662,7 +11619,7 @@ ; ; AVX2-LABEL: ult_16_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11744,7 +11701,7 @@ ; ; AVX2-LABEL: ugt_16_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11754,8 +11711,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [16,16,16,16] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_16_v4i64: @@ -11826,7 +11782,7 @@ ; ; AVX2-LABEL: ult_17_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11908,7 +11864,7 @@ ; ; AVX2-LABEL: ugt_17_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -11918,8 +11874,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [17,17,17,17] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_17_v4i64: @@ -11990,7 +11945,7 @@ ; ; AVX2-LABEL: ult_18_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12072,7 +12027,7 @@ ; ; AVX2-LABEL: ugt_18_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12082,8 +12037,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18,18,18,18] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_18_v4i64: @@ -12154,7 +12108,7 @@ ; ; AVX2-LABEL: ult_19_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12236,7 +12190,7 @@ ; ; AVX2-LABEL: ugt_19_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12246,8 +12200,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [19,19,19,19] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_19_v4i64: @@ -12318,7 +12271,7 @@ ; ; AVX2-LABEL: ult_20_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12400,7 +12353,7 @@ ; ; AVX2-LABEL: ugt_20_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12410,8 +12363,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [20,20,20,20] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_20_v4i64: @@ -12482,7 +12434,7 @@ ; ; AVX2-LABEL: ult_21_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12564,7 +12516,7 @@ ; ; AVX2-LABEL: ugt_21_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12574,8 +12526,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [21,21,21,21] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_21_v4i64: @@ -12646,7 +12597,7 @@ ; ; AVX2-LABEL: ult_22_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12728,7 +12679,7 @@ ; ; AVX2-LABEL: ugt_22_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12738,8 +12689,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [22,22,22,22] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_22_v4i64: @@ -12810,7 +12760,7 @@ ; ; AVX2-LABEL: ult_23_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12892,7 +12842,7 @@ ; ; AVX2-LABEL: ugt_23_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -12902,8 +12852,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [23,23,23,23] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_23_v4i64: @@ -12974,7 +12923,7 @@ ; ; AVX2-LABEL: ult_24_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13056,7 +13005,7 @@ ; ; AVX2-LABEL: ugt_24_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13066,8 +13015,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [24,24,24,24] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_24_v4i64: @@ -13138,7 +13086,7 @@ ; ; AVX2-LABEL: ult_25_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13220,7 +13168,7 @@ ; ; AVX2-LABEL: ugt_25_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13230,8 +13178,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [25,25,25,25] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_25_v4i64: @@ -13302,7 +13249,7 @@ ; ; AVX2-LABEL: ult_26_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13384,7 +13331,7 @@ ; ; AVX2-LABEL: ugt_26_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13394,8 +13341,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [26,26,26,26] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_26_v4i64: @@ -13466,7 +13412,7 @@ ; ; AVX2-LABEL: ult_27_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13548,7 +13494,7 @@ ; ; AVX2-LABEL: ugt_27_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13558,8 +13504,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [27,27,27,27] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_27_v4i64: @@ -13630,7 +13575,7 @@ ; ; 
AVX2-LABEL: ult_28_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13712,7 +13657,7 @@ ; ; AVX2-LABEL: ugt_28_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13722,8 +13667,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [28,28,28,28] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_28_v4i64: @@ -13794,7 +13738,7 @@ ; ; AVX2-LABEL: ult_29_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13876,7 +13820,7 @@ ; ; AVX2-LABEL: ugt_29_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -13886,8 +13830,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [29,29,29,29] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_29_v4i64: @@ -13958,7 +13901,7 @@ ; ; AVX2-LABEL: ult_30_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14040,7 +13983,7 @@ ; ; AVX2-LABEL: ugt_30_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14050,8 +13993,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [30,30,30,30] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_30_v4i64: @@ -14122,7 +14064,7 @@ ; ; AVX2-LABEL: ult_31_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14204,7 +14146,7 @@ ; ; AVX2-LABEL: ugt_31_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14214,8 +14156,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [31,31,31,31] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_31_v4i64: @@ -14286,7 +14227,7 @@ ; ; AVX2-LABEL: ult_32_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14368,7 +14309,7 @@ ; ; AVX2-LABEL: ugt_32_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14378,8 +14319,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32,32,32,32] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_32_v4i64: @@ -14450,7 +14390,7 @@ ; ; AVX2-LABEL: ult_33_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14532,7 +14472,7 @@ ; ; AVX2-LABEL: ugt_33_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14542,8 +14482,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [33,33,33,33] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_33_v4i64: @@ -14614,7 +14553,7 @@ ; ; AVX2-LABEL: ult_34_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14696,7 +14635,7 @@ ; ; AVX2-LABEL: ugt_34_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14706,8 +14645,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [34,34,34,34] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_34_v4i64: @@ -14778,7 +14716,7 @@ ; ; AVX2-LABEL: ult_35_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14860,7 +14798,7 @@ ; ; AVX2-LABEL: ugt_35_v4i64: 
; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -14870,8 +14808,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [35,35,35,35] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_35_v4i64: @@ -14942,7 +14879,7 @@ ; ; AVX2-LABEL: ult_36_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15024,7 +14961,7 @@ ; ; AVX2-LABEL: ugt_36_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15034,8 +14971,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [36,36,36,36] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_36_v4i64: @@ -15106,7 +15042,7 @@ ; ; AVX2-LABEL: ult_37_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15188,7 +15124,7 @@ ; ; AVX2-LABEL: ugt_37_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15198,8 +15134,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [37,37,37,37] -; 
AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_37_v4i64: @@ -15270,7 +15205,7 @@ ; ; AVX2-LABEL: ult_38_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15352,7 +15287,7 @@ ; ; AVX2-LABEL: ugt_38_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15362,8 +15297,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [38,38,38,38] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_38_v4i64: @@ -15434,7 +15368,7 @@ ; ; AVX2-LABEL: ult_39_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15516,7 +15450,7 @@ ; ; AVX2-LABEL: ugt_39_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15526,8 +15460,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [39,39,39,39] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_39_v4i64: @@ -15598,7 +15531,7 @@ ; ; AVX2-LABEL: ult_40_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, 
%ymm3, %ymm2 @@ -15680,7 +15613,7 @@ ; ; AVX2-LABEL: ugt_40_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15690,8 +15623,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [40,40,40,40] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_40_v4i64: @@ -15762,7 +15694,7 @@ ; ; AVX2-LABEL: ult_41_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15844,7 +15776,7 @@ ; ; AVX2-LABEL: ugt_41_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -15854,8 +15786,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [41,41,41,41] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_41_v4i64: @@ -15926,7 +15857,7 @@ ; ; AVX2-LABEL: ult_42_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16008,7 +15939,7 @@ ; ; AVX2-LABEL: ugt_42_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16018,8 +15949,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [42,42,42,42] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_42_v4i64: @@ -16090,7 +16020,7 @@ ; ; AVX2-LABEL: ult_43_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16172,7 +16102,7 @@ ; ; AVX2-LABEL: ugt_43_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16182,8 +16112,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [43,43,43,43] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_43_v4i64: @@ -16254,7 +16183,7 @@ ; ; AVX2-LABEL: ult_44_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16336,7 +16265,7 @@ ; ; AVX2-LABEL: ugt_44_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16346,8 +16275,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [44,44,44,44] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_44_v4i64: @@ -16418,7 +16346,7 @@ ; ; AVX2-LABEL: ult_45_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16500,7 +16428,7 @@ ; ; AVX2-LABEL: ugt_45_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16510,8 +16438,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [45,45,45,45] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_45_v4i64: @@ -16582,7 +16509,7 @@ ; ; AVX2-LABEL: ult_46_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16664,7 +16591,7 @@ ; ; AVX2-LABEL: ugt_46_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16674,8 +16601,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [46,46,46,46] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_46_v4i64: @@ -16746,7 +16672,7 @@ ; ; AVX2-LABEL: ult_47_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16828,7 +16754,7 @@ ; ; AVX2-LABEL: ugt_47_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16838,8 +16764,7 @@ ; AVX2-NEXT: vpaddb %ymm2, 
%ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [47,47,47,47] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_47_v4i64: @@ -16910,7 +16835,7 @@ ; ; AVX2-LABEL: ult_48_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -16992,7 +16917,7 @@ ; ; AVX2-LABEL: ugt_48_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17002,8 +16927,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [48,48,48,48] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_48_v4i64: @@ -17074,7 +16998,7 @@ ; ; AVX2-LABEL: ult_49_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17156,7 +17080,7 @@ ; ; AVX2-LABEL: ugt_49_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17166,8 +17090,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [49,49,49,49] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_49_v4i64: @@ -17238,7 +17161,7 @@ ; ; AVX2-LABEL: ult_50_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand 
%ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17320,7 +17243,7 @@ ; ; AVX2-LABEL: ugt_50_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17330,8 +17253,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [50,50,50,50] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_50_v4i64: @@ -17402,7 +17324,7 @@ ; ; AVX2-LABEL: ult_51_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17484,7 +17406,7 @@ ; ; AVX2-LABEL: ugt_51_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17494,8 +17416,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [51,51,51,51] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_51_v4i64: @@ -17566,7 +17487,7 @@ ; ; AVX2-LABEL: ult_52_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17648,7 +17569,7 @@ ; ; AVX2-LABEL: ugt_52_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, 
%ymm2 @@ -17658,8 +17579,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [52,52,52,52] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_52_v4i64: @@ -17730,7 +17650,7 @@ ; ; AVX2-LABEL: ult_53_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17812,7 +17732,7 @@ ; ; AVX2-LABEL: ugt_53_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17822,8 +17742,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [53,53,53,53] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_53_v4i64: @@ -17894,7 +17813,7 @@ ; ; AVX2-LABEL: ult_54_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17976,7 +17895,7 @@ ; ; AVX2-LABEL: ugt_54_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -17986,8 +17905,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [54,54,54,54] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_54_v4i64: @@ -18058,7 +17976,7 @@ ; ; AVX2-LABEL: ult_55_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18140,7 +18058,7 @@ ; ; AVX2-LABEL: ugt_55_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18150,8 +18068,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [55,55,55,55] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_55_v4i64: @@ -18222,7 +18139,7 @@ ; ; AVX2-LABEL: ult_56_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18304,7 +18221,7 @@ ; ; AVX2-LABEL: ugt_56_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18314,8 +18231,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [56,56,56,56] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_56_v4i64: @@ -18386,7 +18302,7 @@ ; ; AVX2-LABEL: ult_57_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18468,7 +18384,7 @@ ; ; AVX2-LABEL: ugt_57_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18478,8 +18394,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [57,57,57,57] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_57_v4i64: @@ -18550,7 +18465,7 @@ ; ; AVX2-LABEL: ult_58_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18632,7 +18547,7 @@ ; ; AVX2-LABEL: ugt_58_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18642,8 +18557,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [58,58,58,58] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_58_v4i64: @@ -18714,7 +18628,7 @@ ; ; AVX2-LABEL: ult_59_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18796,7 +18710,7 @@ ; ; AVX2-LABEL: ugt_59_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18806,8 +18720,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [59,59,59,59] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_59_v4i64: @@ -18878,7 +18791,7 @@ ; ; AVX2-LABEL: ult_60_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18960,7 +18873,7 @@ ; ; AVX2-LABEL: ugt_60_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -18970,8 +18883,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [60,60,60,60] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_60_v4i64: @@ -19042,7 +18954,7 @@ ; ; AVX2-LABEL: ult_61_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -19124,7 +19036,7 @@ ; ; AVX2-LABEL: ugt_61_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -19134,8 +19046,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [61,61,61,61] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_61_v4i64: @@ -19206,7 +19117,7 @@ ; ; AVX2-LABEL: ult_62_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -19288,7 +19199,7 @@ ; ; AVX2-LABEL: ugt_62_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -19298,8 +19209,7 @@ ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [62,62,62,62] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: ugt_62_v4i64: @@ -19370,7 +19280,7 @@ ; ; AVX2-LABEL: ult_63_v4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256.ll b/llvm/test/CodeGen/X86/vector-popcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-256.ll @@ -33,7 +33,7 @@ ; ; AVX2-LABEL: testv4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -131,7 +131,7 @@ ; ; AVX2-LABEL: testv8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -244,7 +244,7 @@ ; ; AVX2-LABEL: testv16i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -334,7 +334,7 @@ ; ; AVX2-LABEL: testv32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -365,7 +365,7 @@ ; ; AVX512VPOPCNTDQ-LABEL: testv32i8: ; AVX512VPOPCNTDQ: 
# %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512-ult-ugt.ll @@ -125,7 +125,7 @@ define <64 x i8> @ugt_2_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_2_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -140,7 +140,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -162,7 +162,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_2_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -177,7 +177,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -212,7 +212,7 @@ define <64 x i8> @ult_3_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_3_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -227,7 +227,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -249,7 +249,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_3_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -264,7 +264,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -299,7 +299,7 @@ define <64 x i8> @ugt_3_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_3_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -314,7 +314,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -336,7 +336,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_3_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -351,7 +351,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -386,7 +386,7 @@ define <64 x i8> @ult_4_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_4_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -401,7 +401,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -423,7 +423,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_4_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -438,7 +438,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -473,7 +473,7 @@ define <64 x i8> @ugt_4_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_4_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: 
vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -488,7 +488,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -510,7 +510,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_4_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -525,7 +525,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -560,7 +560,7 @@ define <64 x i8> @ult_5_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_5_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -575,7 +575,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -597,7 +597,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_5_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -612,7 +612,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -647,7 +647,7 @@ define <64 x i8> @ugt_5_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_5_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -662,7 +662,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -684,7 +684,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_5_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -699,7 +699,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -734,7 +734,7 @@ define <64 x i8> @ult_6_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_6_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: 
vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -749,7 +749,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -771,7 +771,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_6_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -786,7 +786,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -821,7 +821,7 @@ define <64 x i8> @ugt_6_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ugt_6_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -836,7 +836,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -858,7 +858,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ugt_6_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -873,7 +873,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -908,7 +908,7 @@ define <64 x i8> @ult_7_v64i8(<64 x i8> %0) { ; AVX512F-LABEL: ult_7_v64i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -923,7 +923,7 @@ ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512F-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -945,7 +945,7 @@ ; ; AVX512VPOPCNTDQ-NOBW-LABEL: ult_7_v64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -960,7 +960,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpaddb %ymm4, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtb %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1111,7 +1111,7 @@ define <32 x i16> @ugt_2_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_2_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1132,7 +1132,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1164,7 +1164,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1202,7 +1202,7 @@ define <32 x i16> @ult_3_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_3_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1223,7 +1223,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1255,7 +1255,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1293,7 +1293,7 @@ define <32 x i16> @ugt_3_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_3_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1314,7 +1314,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1346,7 +1346,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1384,7 +1384,7 @@ define <32 x i16> @ult_4_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_4_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1405,7 +1405,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1437,7 +1437,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1475,7 +1475,7 @@ define <32 x i16> @ugt_4_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_4_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1496,7 +1496,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1528,7 +1528,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1566,7 +1566,7 @@ define <32 x i16> @ult_5_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_5_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1587,7 +1587,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1619,7 +1619,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1657,7 +1657,7 @@ define <32 x i16> @ugt_5_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_5_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1678,7 +1678,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1710,7 +1710,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1748,7 +1748,7 @@ define <32 x i16> @ult_6_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_6_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1769,7 +1769,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1801,7 +1801,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1839,7 +1839,7 @@ define <32 x i16> @ugt_6_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_6_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1860,7 +1860,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1892,7 +1892,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1930,7 +1930,7 @@ define <32 x i16> @ult_7_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_7_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1951,7 +1951,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1983,7 +1983,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2021,7 +2021,7 @@ define <32 x i16> @ugt_7_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_7_v32i16: ; AVX512F: # %bb.0: -; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2042,7 +2042,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2074,7 +2074,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2112,7 +2112,7 @@ define <32 x i16> @ult_8_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_8_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2133,7 +2133,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2165,7 +2165,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2203,7 +2203,7 @@ define <32 x i16> @ugt_8_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: 
ugt_8_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2224,7 +2224,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2256,7 +2256,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2294,7 +2294,7 @@ define <32 x i16> @ult_9_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_9_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2315,7 +2315,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2347,7 +2347,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2385,7 +2385,7 @@ define <32 x i16> 
@ugt_9_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_9_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2406,7 +2406,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2438,7 +2438,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2476,7 +2476,7 @@ define <32 x i16> @ult_10_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_10_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2497,7 +2497,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2529,7 +2529,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; 
AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2567,7 +2567,7 @@ define <32 x i16> @ugt_10_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_10_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2588,7 +2588,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2620,7 +2620,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2658,7 +2658,7 @@ define <32 x i16> @ult_11_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_11_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2679,7 +2679,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2711,7 +2711,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = 
[11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2749,7 +2749,7 @@ define <32 x i16> @ugt_11_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_11_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2770,7 +2770,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2802,7 +2802,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2840,7 +2840,7 @@ define <32 x i16> @ult_12_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_12_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2861,7 +2861,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2893,7 +2893,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, 
%ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2931,7 +2931,7 @@ define <32 x i16> @ugt_12_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_12_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -2952,7 +2952,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -2984,7 +2984,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3022,7 +3022,7 @@ define <32 x i16> @ult_13_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_13_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3043,7 +3043,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3075,7 +3075,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3113,7 +3113,7 @@ define <32 x i16> @ugt_13_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_13_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3134,7 +3134,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3166,7 +3166,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3204,7 +3204,7 @@ define <32 x i16> @ult_14_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_14_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3225,7 +3225,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; 
AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3257,7 +3257,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3295,7 +3295,7 @@ define <32 x i16> @ugt_14_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ugt_14_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3316,7 +3316,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3348,7 +3348,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm0, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3386,7 +3386,7 @@ define <32 x i16> @ult_15_v32i16(<32 x i16> %0) { ; AVX512F-LABEL: ult_15_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -3407,7 +3407,7 @@ ; AVX512F-NEXT: vpsllw $8, %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpcmpgtw %ymm0, %ymm1, %ymm0 ; AVX512F-NEXT: vpcmpgtw %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3439,7 +3439,7 @@ ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512VPOPCNTDQ-NOBW-NEXT: vpopcntd %zmm0, %zmm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm0, %ymm2, %ymm0 ; AVX512VPOPCNTDQ-NOBW-NEXT: vpcmpgtw %ymm1, %ymm2, %ymm1 ; AVX512VPOPCNTDQ-NOBW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -3552,7 +3552,7 @@ ; AVX512F-LABEL: ugt_2_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3631,7 +3631,7 @@ ; AVX512F-LABEL: ult_3_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3710,7 +3710,7 @@ ; AVX512F-LABEL: ugt_3_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3789,7 +3789,7 @@ ; AVX512F-LABEL: ult_4_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3868,7 +3868,7 @@ ; AVX512F-LABEL: ugt_4_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: 
vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -3947,7 +3947,7 @@ ; AVX512F-LABEL: ult_5_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4026,7 +4026,7 @@ ; AVX512F-LABEL: ugt_5_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4105,7 +4105,7 @@ ; AVX512F-LABEL: ult_6_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4184,7 +4184,7 @@ ; AVX512F-LABEL: ugt_6_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4263,7 +4263,7 @@ ; AVX512F-LABEL: ult_7_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4342,7 +4342,7 @@ ; AVX512F-LABEL: ugt_7_v16i32: ; AVX512F: # %bb.0: 
; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4421,7 +4421,7 @@ ; AVX512F-LABEL: ult_8_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4500,7 +4500,7 @@ ; AVX512F-LABEL: ugt_8_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4579,7 +4579,7 @@ ; AVX512F-LABEL: ult_9_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4658,7 +4658,7 @@ ; AVX512F-LABEL: ugt_9_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4737,7 +4737,7 @@ ; AVX512F-LABEL: ult_10_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4816,7 +4816,7 @@ ; AVX512F-LABEL: ugt_10_v16i32: ; 
AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4895,7 +4895,7 @@ ; AVX512F-LABEL: ult_11_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -4974,7 +4974,7 @@ ; AVX512F-LABEL: ugt_11_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5053,7 +5053,7 @@ ; AVX512F-LABEL: ult_12_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5132,7 +5132,7 @@ ; AVX512F-LABEL: ugt_12_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5211,7 +5211,7 @@ ; AVX512F-LABEL: ult_13_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5290,7 +5290,7 @@ ; 
AVX512F-LABEL: ugt_13_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5369,7 +5369,7 @@ ; AVX512F-LABEL: ult_14_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5448,7 +5448,7 @@ ; AVX512F-LABEL: ugt_14_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5527,7 +5527,7 @@ ; AVX512F-LABEL: ult_15_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5606,7 +5606,7 @@ ; AVX512F-LABEL: ugt_15_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5685,7 +5685,7 @@ ; AVX512F-LABEL: ult_16_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ 
-5764,7 +5764,7 @@ ; AVX512F-LABEL: ugt_16_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5843,7 +5843,7 @@ ; AVX512F-LABEL: ult_17_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -5922,7 +5922,7 @@ ; AVX512F-LABEL: ugt_17_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6001,7 +6001,7 @@ ; AVX512F-LABEL: ult_18_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6080,7 +6080,7 @@ ; AVX512F-LABEL: ugt_18_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6159,7 +6159,7 @@ ; AVX512F-LABEL: ult_19_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb 
%ymm3, %ymm4, %ymm3 @@ -6238,7 +6238,7 @@ ; AVX512F-LABEL: ugt_19_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6317,7 +6317,7 @@ ; AVX512F-LABEL: ult_20_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6396,7 +6396,7 @@ ; AVX512F-LABEL: ugt_20_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6475,7 +6475,7 @@ ; AVX512F-LABEL: ult_21_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6554,7 +6554,7 @@ ; AVX512F-LABEL: ugt_21_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6633,7 +6633,7 @@ ; AVX512F-LABEL: ult_22_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; 
AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6712,7 +6712,7 @@ ; AVX512F-LABEL: ugt_22_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6791,7 +6791,7 @@ ; AVX512F-LABEL: ult_23_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6870,7 +6870,7 @@ ; AVX512F-LABEL: ugt_23_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -6949,7 +6949,7 @@ ; AVX512F-LABEL: ult_24_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7028,7 +7028,7 @@ ; AVX512F-LABEL: ugt_24_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7107,7 +7107,7 @@ ; AVX512F-LABEL: ult_25_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7186,7 +7186,7 @@ ; AVX512F-LABEL: ugt_25_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7265,7 +7265,7 @@ ; AVX512F-LABEL: ult_26_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7344,7 +7344,7 @@ ; AVX512F-LABEL: ugt_26_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7423,7 +7423,7 @@ ; AVX512F-LABEL: ult_27_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7502,7 +7502,7 @@ ; AVX512F-LABEL: ugt_27_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7581,7 +7581,7 @@ ; AVX512F-LABEL: ult_28_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: 
vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7660,7 +7660,7 @@ ; AVX512F-LABEL: ugt_28_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7739,7 +7739,7 @@ ; AVX512F-LABEL: ult_29_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7818,7 +7818,7 @@ ; AVX512F-LABEL: ugt_29_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7897,7 +7897,7 @@ ; AVX512F-LABEL: ult_30_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -7976,7 +7976,7 @@ ; AVX512F-LABEL: ugt_30_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8055,7 +8055,7 @@ ; AVX512F-LABEL: ult_31_v16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, 
%ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8208,7 +8208,7 @@ ; AVX512F-LABEL: ugt_2_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8271,7 +8271,7 @@ ; AVX512F-LABEL: ult_3_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8334,7 +8334,7 @@ ; AVX512F-LABEL: ugt_3_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8397,7 +8397,7 @@ ; AVX512F-LABEL: ult_4_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8460,7 +8460,7 @@ ; AVX512F-LABEL: ugt_4_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8523,7 +8523,7 @@ ; AVX512F-LABEL: ult_5_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand 
%ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8586,7 +8586,7 @@ ; AVX512F-LABEL: ugt_5_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8649,7 +8649,7 @@ ; AVX512F-LABEL: ult_6_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8712,7 +8712,7 @@ ; AVX512F-LABEL: ugt_6_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8775,7 +8775,7 @@ ; AVX512F-LABEL: ult_7_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8838,7 +8838,7 @@ ; AVX512F-LABEL: ugt_7_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8901,7 +8901,7 @@ ; AVX512F-LABEL: ult_8_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; 
AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -8964,7 +8964,7 @@ ; AVX512F-LABEL: ugt_8_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9027,7 +9027,7 @@ ; AVX512F-LABEL: ult_9_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9090,7 +9090,7 @@ ; AVX512F-LABEL: ugt_9_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9153,7 +9153,7 @@ ; AVX512F-LABEL: ult_10_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9216,7 +9216,7 @@ ; AVX512F-LABEL: ugt_10_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9279,7 +9279,7 @@ ; AVX512F-LABEL: ult_11_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9342,7 +9342,7 @@ ; AVX512F-LABEL: ugt_11_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9405,7 +9405,7 @@ ; AVX512F-LABEL: ult_12_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9468,7 +9468,7 @@ ; AVX512F-LABEL: ugt_12_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9531,7 +9531,7 @@ ; AVX512F-LABEL: ult_13_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9594,7 +9594,7 @@ ; AVX512F-LABEL: ugt_13_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9657,7 +9657,7 @@ ; AVX512F-LABEL: ult_14_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb 
{{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9720,7 +9720,7 @@ ; AVX512F-LABEL: ugt_14_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9783,7 +9783,7 @@ ; AVX512F-LABEL: ult_15_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9846,7 +9846,7 @@ ; AVX512F-LABEL: ugt_15_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9909,7 +9909,7 @@ ; AVX512F-LABEL: ult_16_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -9972,7 +9972,7 @@ ; AVX512F-LABEL: ugt_16_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10035,7 +10035,7 @@ ; AVX512F-LABEL: ult_17_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10098,7 +10098,7 @@ ; AVX512F-LABEL: ugt_17_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10161,7 +10161,7 @@ ; AVX512F-LABEL: ult_18_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10224,7 +10224,7 @@ ; AVX512F-LABEL: ugt_18_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10287,7 +10287,7 @@ ; AVX512F-LABEL: ult_19_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10350,7 +10350,7 @@ ; AVX512F-LABEL: ugt_19_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10413,7 +10413,7 @@ ; AVX512F-LABEL: ult_20_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10476,7 +10476,7 @@ ; AVX512F-LABEL: ugt_20_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10539,7 +10539,7 @@ ; AVX512F-LABEL: ult_21_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10602,7 +10602,7 @@ ; AVX512F-LABEL: ugt_21_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10665,7 +10665,7 @@ ; AVX512F-LABEL: ult_22_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10728,7 +10728,7 @@ ; AVX512F-LABEL: ugt_22_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10791,7 +10791,7 @@ ; AVX512F-LABEL: ult_23_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10854,7 +10854,7 @@ ; AVX512F-LABEL: ugt_23_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10917,7 +10917,7 @@ ; AVX512F-LABEL: ult_24_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -10980,7 +10980,7 @@ ; AVX512F-LABEL: ugt_24_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11043,7 +11043,7 @@ ; AVX512F-LABEL: ult_25_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11106,7 +11106,7 @@ ; AVX512F-LABEL: ugt_25_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11169,7 +11169,7 @@ ; AVX512F-LABEL: ult_26_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: 
vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11232,7 +11232,7 @@ ; AVX512F-LABEL: ugt_26_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11295,7 +11295,7 @@ ; AVX512F-LABEL: ult_27_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11358,7 +11358,7 @@ ; AVX512F-LABEL: ugt_27_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11421,7 +11421,7 @@ ; AVX512F-LABEL: ult_28_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11484,7 +11484,7 @@ ; AVX512F-LABEL: ugt_28_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11547,7 +11547,7 @@ ; AVX512F-LABEL: ult_29_v8i64: ; 
AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11610,7 +11610,7 @@ ; AVX512F-LABEL: ugt_29_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11673,7 +11673,7 @@ ; AVX512F-LABEL: ult_30_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11736,7 +11736,7 @@ ; AVX512F-LABEL: ugt_30_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11799,7 +11799,7 @@ ; AVX512F-LABEL: ult_31_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11862,7 +11862,7 @@ ; AVX512F-LABEL: ugt_31_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11925,7 +11925,7 @@ ; 
AVX512F-LABEL: ult_32_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -11988,7 +11988,7 @@ ; AVX512F-LABEL: ugt_32_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12051,7 +12051,7 @@ ; AVX512F-LABEL: ult_33_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12114,7 +12114,7 @@ ; AVX512F-LABEL: ugt_33_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12177,7 +12177,7 @@ ; AVX512F-LABEL: ult_34_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12240,7 +12240,7 @@ ; AVX512F-LABEL: ugt_34_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, 
%ymm3 @@ -12303,7 +12303,7 @@ ; AVX512F-LABEL: ult_35_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12366,7 +12366,7 @@ ; AVX512F-LABEL: ugt_35_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12429,7 +12429,7 @@ ; AVX512F-LABEL: ult_36_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12492,7 +12492,7 @@ ; AVX512F-LABEL: ugt_36_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12555,7 +12555,7 @@ ; AVX512F-LABEL: ult_37_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12618,7 +12618,7 @@ ; AVX512F-LABEL: ugt_37_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; 
AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12681,7 +12681,7 @@ ; AVX512F-LABEL: ult_38_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12744,7 +12744,7 @@ ; AVX512F-LABEL: ugt_38_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12807,7 +12807,7 @@ ; AVX512F-LABEL: ult_39_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12870,7 +12870,7 @@ ; AVX512F-LABEL: ugt_39_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12933,7 +12933,7 @@ ; AVX512F-LABEL: ult_40_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -12996,7 +12996,7 @@ ; AVX512F-LABEL: ugt_40_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13059,7 +13059,7 @@ ; AVX512F-LABEL: ult_41_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13122,7 +13122,7 @@ ; AVX512F-LABEL: ugt_41_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13185,7 +13185,7 @@ ; AVX512F-LABEL: ult_42_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13248,7 +13248,7 @@ ; AVX512F-LABEL: ugt_42_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13311,7 +13311,7 @@ ; AVX512F-LABEL: ult_43_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13374,7 +13374,7 @@ ; AVX512F-LABEL: ugt_43_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13437,7 +13437,7 @@ ; AVX512F-LABEL: ult_44_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13500,7 +13500,7 @@ ; AVX512F-LABEL: ugt_44_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13563,7 +13563,7 @@ ; AVX512F-LABEL: ult_45_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13626,7 +13626,7 @@ ; AVX512F-LABEL: ugt_45_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13689,7 +13689,7 @@ ; AVX512F-LABEL: ult_46_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13752,7 +13752,7 @@ ; AVX512F-LABEL: ugt_46_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: 
vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13815,7 +13815,7 @@ ; AVX512F-LABEL: ult_47_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13878,7 +13878,7 @@ ; AVX512F-LABEL: ugt_47_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -13941,7 +13941,7 @@ ; AVX512F-LABEL: ult_48_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14004,7 +14004,7 @@ ; AVX512F-LABEL: ugt_48_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14067,7 +14067,7 @@ ; AVX512F-LABEL: ult_49_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14130,7 +14130,7 @@ ; AVX512F-LABEL: ugt_49_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14193,7 +14193,7 @@ ; AVX512F-LABEL: ult_50_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14256,7 +14256,7 @@ ; AVX512F-LABEL: ugt_50_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14319,7 +14319,7 @@ ; AVX512F-LABEL: ult_51_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14382,7 +14382,7 @@ ; AVX512F-LABEL: ugt_51_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14445,7 +14445,7 @@ ; AVX512F-LABEL: ult_52_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14508,7 +14508,7 @@ ; AVX512F-LABEL: ugt_52_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: 
vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14571,7 +14571,7 @@ ; AVX512F-LABEL: ult_53_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14634,7 +14634,7 @@ ; AVX512F-LABEL: ugt_53_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14697,7 +14697,7 @@ ; AVX512F-LABEL: ult_54_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14760,7 +14760,7 @@ ; AVX512F-LABEL: ugt_54_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14823,7 +14823,7 @@ ; AVX512F-LABEL: ult_55_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14886,7 +14886,7 @@ ; AVX512F-LABEL: ugt_55_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -14949,7 +14949,7 @@ ; AVX512F-LABEL: ult_56_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15012,7 +15012,7 @@ ; AVX512F-LABEL: ugt_56_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15075,7 +15075,7 @@ ; AVX512F-LABEL: ult_57_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15138,7 +15138,7 @@ ; AVX512F-LABEL: ugt_57_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15201,7 +15201,7 @@ ; AVX512F-LABEL: ult_58_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15264,7 +15264,7 @@ ; AVX512F-LABEL: ugt_58_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15327,7 +15327,7 @@ ; AVX512F-LABEL: ult_59_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15390,7 +15390,7 @@ ; AVX512F-LABEL: ugt_59_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15453,7 +15453,7 @@ ; AVX512F-LABEL: ult_60_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15516,7 +15516,7 @@ ; AVX512F-LABEL: ugt_60_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15579,7 +15579,7 @@ ; AVX512F-LABEL: ult_61_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15642,7 +15642,7 @@ ; AVX512F-LABEL: ugt_61_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: 
vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15705,7 +15705,7 @@ ; AVX512F-LABEL: ult_62_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15768,7 +15768,7 @@ ; AVX512F-LABEL: ugt_62_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -15831,7 +15831,7 @@ ; AVX512F-LABEL: ult_63_v8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-popcnt-512.ll b/llvm/test/CodeGen/X86/vector-popcnt-512.ll --- a/llvm/test/CodeGen/X86/vector-popcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-popcnt-512.ll @@ -9,7 +9,7 @@ ; AVX512F-LABEL: testv8i64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -62,7 +62,7 @@ ; AVX512F-LABEL: testv16i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} 
ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -130,7 +130,7 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512F-LABEL: testv32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -208,7 +208,7 @@ ; AVX512F-LABEL: testv64i8: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512F-NEXT: vpshufb %ymm3, %ymm4, %ymm3 @@ -240,7 +240,7 @@ ; AVX512VPOPCNTDQ-NOBW-LABEL: testv64i8: ; AVX512VPOPCNTDQ-NOBW: # %bb.0: ; AVX512VPOPCNTDQ-NOBW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NOBW-NEXT: vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpand %ymm2, %ymm1, %ymm3 ; AVX512VPOPCNTDQ-NOBW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NOBW-NEXT: vpshufb %ymm3, %ymm4, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -231,7 +231,7 @@ ; ; AVX1-LABEL: test_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [1,1,1,1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324,4.9406564584124654E-324] ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -539,7 +539,7 @@ ; ; AVX1-SLOW-LABEL: test_v16i32_v16i8: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -557,7 +557,7 @@ ; ; AVX1-FAST-LABEL: test_v16i32_v16i8: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-FAST-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 @@ -656,7 +656,7 
@@ ; ; AVX1-SLOW-LABEL: test_v32i32_v32i8: ; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-SLOW-NEXT: vbroadcastss {{.*#+}} ymm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-SLOW-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -682,7 +682,7 @@ ; ; AVX1-FAST-LABEL: test_v32i32_v32i8: ; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] +; AVX1-FAST-NEXT: vbroadcastss {{.*#+}} ymm4 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-FAST-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-FAST-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-FAST-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -1184,7 +1184,7 @@ ; ; AVX1-LABEL: test_v64i16_v64i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [1.1663286E-38,1.1663286E-38,1.1663286E-38,1.1663286E-38,1.1663286E-38,1.1663286E-38,1.1663286E-38,1.1663286E-38] ; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 @@ -1214,7 +1214,7 @@ ; ; AVX2-LABEL: test_v64i16_v64i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 ; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -70,18 +70,11 @@ ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_v4i32_v4i1: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1-NEXT: setb %al -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v4i32_v4i1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vptest %xmm1, %xmm0 -; AVX2-NEXT: setb %al -; AVX2-NEXT: retq +; AVX1OR2-LABEL: trunc_v4i32_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: trunc_v4i32_v4i1: ; AVX512: # %bb.0: @@ -162,20 +155,12 @@ ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_v4i64_v4i1: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v4i64_v4i1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: trunc_v4i64_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: trunc_v4i64_v4i1: ; AVX512: # %bb.0: @@ -206,20 +191,12 @@ ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_v8i32_v8i1: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setb %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: 
retq -; -; AVX2-LABEL: trunc_v8i32_v8i1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setb %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: trunc_v8i32_v8i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1OR2-NEXT: setb %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: trunc_v8i32_v8i1: ; AVX512: # %bb.0: @@ -331,8 +308,7 @@ ; AVX2-LABEL: trunc_v8i64_v8i1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -383,8 +359,7 @@ ; AVX2-LABEL: trunc_v16i32_v16i1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: setb %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -70,18 +70,11 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_v4i32_v4i1: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1-NEXT: setne %al -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v4i32_v4i1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; AVX2-NEXT: vptest %xmm1, %xmm0 -; AVX2-NEXT: setne %al -; AVX2-NEXT: retq +; AVX1OR2-LABEL: trunc_v4i32_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: trunc_v4i32_v4i1: ; AVX512: # %bb.0: @@ -161,20 +154,12 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_v4i64_v4i1: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setne %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v4i64_v4i1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setne %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: trunc_v4i64_v4i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: trunc_v4i64_v4i1: ; AVX512: # %bb.0: @@ -205,20 +190,12 @@ ; SSE41-NEXT: setne %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_v8i32_v8i1: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: setne %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i32_v8i1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: setne %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: trunc_v8i32_v8i1: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1OR2-NEXT: setne %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: trunc_v8i32_v8i1: ; AVX512: # %bb.0: @@ -328,8 +305,7 @@ ; AVX2-LABEL: trunc_v8i64_v8i1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1,1,1,1] -; AVX2-NEXT: 
vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -379,8 +355,7 @@ ; AVX2-LABEL: trunc_v16i32_v16i1: ; AVX2: # %bb.0: ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1,1,1,1,1,1,1,1] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: setne %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -875,20 +875,12 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: mask_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: mask_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX1OR2-LABEL: mask_v8i32: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: vzeroupper +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: mask_v8i32: ; AVX512: # %bb.0: @@ -978,8 +970,7 @@ ; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72340172838076673,72340172838076673,72340172838076673,72340172838076673] -; AVX2-NEXT: vptest %ymm1, %ymm0 +; AVX2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 ; AVX2-NEXT: sete %al ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1018,20 +1009,12 @@ ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: PR44781: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: retq -; -; AVX2-LABEL: PR44781: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [15,15,15,15] -; AVX2-NEXT: vptest %xmm1, %xmm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: retq +; AVX1OR2-LABEL: PR44781: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1OR2-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; AVX1OR2-NEXT: sete %al +; AVX1OR2-NEXT: retq ; ; AVX512-LABEL: PR44781: ; AVX512: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -63,16 +63,27 @@ ; SSE42-NEXT: movq %xmm2, %rax ; SSE42-NEXT: retq ; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: test_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, 
%xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: @@ -197,7 +208,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -413,10 +424,12 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -794,14 +807,16 @@ ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: # xmm2 = mem[0,0] +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -63,16 +63,27 @@ ; SSE42-NEXT: movq %xmm2, %rax ; SSE42-NEXT: retq ; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq +; AVX1-LABEL: test_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovq %xmm0, %rax +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: ; AVX512BW: # %bb.0: @@ -198,7 +209,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -416,10 +427,12 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: # xmm2 = mem[0,0] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 @@ -798,14 +811,16 @@ ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: # xmm2 = mem[0,0] +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -633,7 +633,7 @@ ; ; AVX1-LABEL: trunc_v16i32_v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -73,7 +73,7 @@ ; ; AVX2-LABEL: var_rotate_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64] ; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsllvq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpsrlvq %xmm2, %xmm0, %xmm0 @@ -179,8 +179,7 @@ ; ; AVX2-LABEL: var_rotate_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32] ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1 @@ -297,7 +296,7 @@ ; AVX2-LABEL: var_rotate_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -318,7 +317,7 @@ ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512F-NEXT: vpsllvd %ymm2, %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512F-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 @@ -348,7 +347,7 @@ ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %xmm1, %xmm3, %xmm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -664,14 +663,23 @@ ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_rotate_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] -; AVX-NEXT: vpsubq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_rotate_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [64,64] +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_rotate_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [64,64] +; AVX2-NEXT: vpsubq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpsrlq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512NOVLX-LABEL: 
splatvar_rotate_v2i64: ; AVX512NOVLX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -127,8 +127,7 @@ ; ; AVX2-LABEL: var_rotate_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31] -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32] ; AVX2-NEXT: vpsubd %ymm1, %ymm3, %ymm1 @@ -222,7 +221,7 @@ ; AVX2-NEXT: vpsllvd %ymm5, %ymm0, %ymm5 ; AVX2-NEXT: vpsrld $16, %ymm5, %ymm5 ; AVX2-NEXT: vpackusdw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm5 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpsubw %ymm1, %ymm5, %ymm1 ; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] ; AVX2-NEXT: vpsrlvd %ymm5, %ymm3, %ymm3 @@ -240,7 +239,7 @@ ; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpsllvd %zmm2, %zmm0, %zmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512F-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 @@ -267,7 +266,7 @@ ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512BW-NEXT: vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512BW-NEXT: vpsubw %ymm1, %ymm3, %ymm1 ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpor %ymm0, %ymm2, %ymm0 @@ -526,7 +525,7 @@ ; AVX2-LABEL: splatvar_rotate_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [64,64] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [64,64] ; AVX2-NEXT: vpsubq %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -37,12 +37,12 @@ define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind { ; AVX512F-LABEL: var_rotate_v32i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512F-NEXT: vpand %ymm2, %ymm1, 
%ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm4 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm5 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512F-NEXT: vpsllvd %zmm4, %zmm5, %zmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm6 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512F-NEXT: vpsubw %ymm3, %ymm6, %ymm3 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = ymm3[0],zero,ymm3[1],zero,ymm3[2],zero,ymm3[3],zero,ymm3[4],zero,ymm3[5],zero,ymm3[6],zero,ymm3[7],zero,ymm3[8],zero,ymm3[9],zero,ymm3[10],zero,ymm3[11],zero,ymm3[12],zero,ymm3[13],zero,ymm3[14],zero,ymm3[15],zero ; AVX512F-NEXT: vpsrlvd %zmm3, %zmm5, %zmm3 @@ -143,7 +143,7 @@ ; AVX512F-NEXT: vpaddb %ymm3, %ymm3, %ymm3 ; AVX512F-NEXT: vpblendvb %ymm3, %ymm6, %ymm2, %ymm2 ; AVX512F-NEXT: vpsrlw $7, %ymm2, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512F-NEXT: vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX512F-NEXT: vpand %ymm6, %ymm4, %ymm4 ; AVX512F-NEXT: vpaddb %ymm2, %ymm2, %ymm8 ; AVX512F-NEXT: vpor %ymm4, %ymm8, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -2524,8 +2524,7 @@ ; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] -; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -2684,8 +2683,7 @@ ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -66,7 +66,7 @@ ; ; AVX2-LABEL: var_shift_v2i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpsrlvq %xmm1, %xmm2, %xmm2 ; AVX2-NEXT: vpsrlvq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 @@ -632,14 +632,23 @@ ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_shift_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 
-; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_shift_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_shift_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_shift_v2i64: ; XOPAVX1: # %bb.0: @@ -837,7 +846,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -933,15 +942,25 @@ ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatvar_modulo_shift_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatvar_modulo_shift_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatvar_modulo_shift_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64: ; XOPAVX1: # %bb.0: @@ -1130,7 +1149,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1708,14 +1727,23 @@ ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, 
%xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v16i8: ; XOP: # %bb.0: @@ -1726,7 +1754,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1762,15 +1790,25 @@ ; SSE-NEXT: psubq %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: PR52719: -; AVX: # %bb.0: -; AVX-NEXT: vmovd %edi, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: PR52719: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd %edi, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR52719: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm1 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOPAVX1-LABEL: PR52719: ; XOPAVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -89,8 +89,7 @@ ; X86-AVX1-LABEL: var_shift_v4i64: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X86-AVX1-NEXT: # xmm3 = mem[0,0] +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] ; X86-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 @@ -115,7 +114,7 @@ ; ; X86-AVX2-LABEL: var_shift_v4i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -723,7 +722,7 @@ ; ; X86-AVX2-LABEL: splatvar_shift_v4i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: 
vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -888,7 +887,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -925,7 +924,7 @@ ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -990,7 +989,7 @@ ; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1084,7 +1083,7 @@ ; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 @@ -1252,7 +1251,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1291,7 +1290,7 @@ ; AVX512DQ-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512DQ-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: 
vpsrlw %xmm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1358,7 +1357,7 @@ ; X86-AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; X86-AVX2-NEXT: vpbroadcastb %xmm2, %ymm2 ; X86-AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; X86-AVX2-NEXT: vpsrlw %xmm1, %ymm2, %ymm1 ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 @@ -1959,7 +1958,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -1977,7 +1976,7 @@ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; XOPAVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; XOPAVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq @@ -1986,7 +1985,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq @@ -2019,7 +2018,7 @@ ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; X86-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: retl @@ -2169,8 +2168,7 @@ ; X86-AVX1-LABEL: PR52719: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X86-AVX1-NEXT: # xmm2 = mem[0,0] +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 @@ -2185,7 +2183,7 @@ ; X86-AVX2-LABEL: PR52719: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpbroadcastq 
{{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 ; X86-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; X86-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll @@ -197,7 +197,7 @@ ; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2 @@ -286,7 +286,7 @@ ; AVX512DQ-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512DQ-NEXT: vpbroadcastb %xmm3, %ymm3 ; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512DQ-NEXT: vpsrlw %xmm1, %ymm4, %ymm4 ; AVX512DQ-NEXT: vpxor %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsubb %ymm4, %ymm2, %ymm2 @@ -449,9 +449,9 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512DQ-NEXT: vpsrlw $3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512DQ-NEXT: vpxor %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpsrlw $3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1356,7 +1356,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1494,7 +1494,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, 
%xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -1632,7 +1632,7 @@ ; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 @@ -2308,14 +2308,23 @@ ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v8i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v8i8: ; XOP: # %bb.0: @@ -2326,7 +2335,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -2361,14 +2370,23 @@ ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v4i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v4i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v4i8: ; XOP: # %bb.0: @@ -2379,7 +2397,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = 
[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -2414,14 +2432,23 @@ ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: splatconstant_shift_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: splatconstant_shift_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: splatconstant_shift_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $3, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; XOP-LABEL: splatconstant_shift_v2i8: ; XOP: # %bb.0: @@ -2432,7 +2459,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 ; AVX512-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; AVX512-NEXT: vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] ; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1773,8 +1773,7 @@ ; AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] -; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -1793,8 +1792,7 @@ ; XOPAVX2-NEXT: vpsrlq $36, %ymm0, %ymm0 ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; XOPAVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; XOPAVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] -; XOPAVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; XOPAVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; XOPAVX2-NEXT: vzeroupper ; XOPAVX2-NEXT: retq ; @@ -1830,8 +1828,7 @@ ; X86-AVX2-NEXT: vpsrlq $36, %ymm0, %ymm0 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; X86-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575] -; X86-AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-AVX2-NEXT: vzeroupper ; X86-AVX2-NEXT: retl %s = lshr <4 x i64> %x, diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -53,18 +53,18 @@ ; 
AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsrlw $4, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $2, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] ; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsrlw $1, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] ; AVX512DQ-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -1261,7 +1261,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1291,7 +1291,7 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -1352,7 +1352,7 @@ ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 -; X86-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; 
X86-AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; X86-AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 ; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; X86-AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-512.ll @@ -53,13 +53,13 @@ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vpsllw $4, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX512DQ-NEXT: vpand %ymm4, %ymm3, %ymm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm5 ; AVX512DQ-NEXT: vpsllw $5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpsllw $2, %ymm2, %ymm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; AVX512DQ-NEXT: vpbroadcastb {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] ; AVX512DQ-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512DQ-NEXT: vpaddb %ymm5, %ymm5, %ymm5 ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 @@ -301,7 +301,7 @@ ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1] ; AVX512DQ-NEXT: # ymm3 = mem[0,1,0,1] ; AVX512DQ-NEXT: vpmullw %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512DQ-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %ymm4, %ymm2, %ymm2 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512DQ-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -613,11 +613,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: 
shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX512VL: # %bb.0: @@ -625,6 +631,18 @@ ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -653,11 +671,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] +; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: ; AVX512VL: # %bb.0: @@ -665,6 +689,18 @@ ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0] +; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] +; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -707,11 +743,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [72056498821267455,72056498821267455] +; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: ; AVX512VL: # %bb.0: @@ -719,6 +761,18 @@ ; 
AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} ; AVX512VL-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0] +; XOPAVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [72056498821267455,72056498821267455] +; XOPAVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %shuffle } @@ -910,7 +964,7 @@ ; ; AVX2-LABEL: shuffle_v16i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -1021,11 +1075,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: load_fold_pblendvb: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] -; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: load_fold_pblendvb: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_fold_pblendvb: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72057589759672320,72057589759672320] +; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: load_fold_pblendvb: ; AVX512VL: # %bb.0: @@ -1033,6 +1093,18 @@ ; AVX512VL-NEXT: kmovd %eax, %k1 ; AVX512VL-NEXT: vmovdqu8 (%rdi), %xmm0 {%k1} ; AVX512VL-NEXT: retq +; +; XOPAVX1-LABEL: load_fold_pblendvb: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: load_fold_pblendvb: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [72057589759672320,72057589759672320] +; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq %x = load <16 x i8>, ptr %px, align 16 %select = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> ret <16 x i8> %select @@ -1065,11 +1137,17 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: load_fold_pblendvb_commute: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] -; AVX1OR2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: load_fold_pblendvb_commute: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: load_fold_pblendvb_commute: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18374686483949879295,18374686483949879295] +; AVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: load_fold_pblendvb_commute: ; AVX512VL: # %bb.0: @@ -1079,6 +1157,18 @@ ; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} ; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VL-NEXT: 
retq +; +; XOPAVX1-LABEL: load_fold_pblendvb_commute: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; XOPAVX1-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: load_fold_pblendvb_commute: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18374686483949879295,18374686483949879295] +; XOPAVX2-NEXT: vpblendvb %xmm1, (%rdi), %xmm0, %xmm0 +; XOPAVX2-NEXT: retq %x = load <16 x i8>, ptr %px, align 16 %select = shufflevector <16 x i8> %y, <16 x i8> %x, <16 x i32> ret <16 x i8> %select @@ -2103,7 +2193,7 @@ ; ; AVX2-LABEL: PR12412: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1010,8 +1010,7 @@ ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_00_00_00_00_00_00_01_00: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,0,0,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -1063,8 +1062,7 @@ ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_00_00_00_00_00_02_00_00: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -1116,8 +1114,7 @@ ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_00_00_00_00_03_00_00_00: ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,0,0,0,1,0] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -1294,7 +1291,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -1307,8 +1304,7 @@ ; ; XOPAVX1-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; XOPAVX1-NEXT: vpcmov %ymm2, %ymm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: 
shuffle_v16i16_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31: @@ -1322,7 +1318,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41] ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -1335,8 +1331,7 @@ ; ; XOPAVX1-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_24_09_26_11_28_13_30_15: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2211,7 +2211,7 @@ define <32 x i8> @shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38] ; AVX1-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 @@ -2219,7 +2219,7 @@ ; ; AVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; @@ -2237,7 +2237,7 @@ ; ; XOPAVX2-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -2247,7 +2247,7 @@ define <32 x i8> @shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = 
[255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38] ; AVX1-NEXT: vandnps %ymm0, %ymm2, %ymm0 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -2255,7 +2255,7 @@ ; ; AVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2273,7 +2273,7 @@ ; ; XOPAVX2-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; XOPAVX2-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; XOPAVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> @@ -2285,7 +2285,7 @@ define <32 x i8> @load_fold_pblendvb(ptr %px, <32 x i8> %y) { ; AVX1-LABEL: load_fold_pblendvb: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303] ; AVX1-NEXT: vandnps (%rdi), %ymm1, %ymm2 ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 @@ -2293,7 +2293,7 @@ ; ; AVX2-LABEL: load_fold_pblendvb: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72057589759672320,72057589759672320,72057589759672320,72057589759672320] ; AVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2306,13 +2306,13 @@ ; ; XOPAVX1-LABEL: load_fold_pblendvb: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295] +; XOPAVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] ; XOPAVX1-NEXT: vpcmov %ymm1, (%rdi), %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: load_fold_pblendvb: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [72057589759672320,72057589759672320,72057589759672320,72057589759672320] ; XOPAVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %x = load <32 x i8>, ptr %px, align 32 @@ -2323,7 +2323,7 @@ define <32 x i8> @load_fold_pblendvb_commute(ptr %px, <32 x i8> %y) { ; AVX1-LABEL: load_fold_pblendvb_commute: ; AVX1: # %bb.0: -; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = 
[-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303,-5.4861292804117373E+303] ; AVX1-NEXT: vandnps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vandps (%rdi), %ymm1, %ymm1 ; AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -2331,7 +2331,7 @@ ; ; AVX2-LABEL: load_fold_pblendvb_commute: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295] ; AVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -2347,13 +2347,12 @@ ; XOPAVX1-LABEL: load_fold_pblendvb_commute: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovdqa (%rdi), %ymm1 -; XOPAVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295] -; XOPAVX1-NEXT: vpcmov %ymm2, %ymm0, %ymm1, %ymm0 +; XOPAVX1-NEXT: vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: load_fold_pblendvb_commute: ; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255] +; XOPAVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18374686483949879295,18374686483949879295,18374686483949879295,18374686483949879295] ; XOPAVX2-NEXT: vpblendvb %ymm1, (%rdi), %ymm0, %ymm0 ; XOPAVX2-NEXT: retq %x = load <32 x i8>, ptr %px, align 32 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -893,8 +893,7 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_c348cda0: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] @@ -905,16 +904,14 @@ ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] -; AVX2-FAST-ALL-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_c348cda0: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <4,u,u,0,4,5,2,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] @@ -993,8 +990,7 @@ ; ; AVX2-FAST-ALL-LABEL: shuffle_v8f32_32103210: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -1041,8 +1037,7 @@ ; ; AVX2-FAST-ALL-LABEL: shuffle_v8f32_76547654: ; AVX2-FAST-ALL: # %bb.0: -; 
AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -1491,7 +1486,7 @@ ; ; AVX2-FAST-ALL-LABEL: shuffle_v8f32_44444444: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -2666,8 +2661,7 @@ ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_32103210: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [3,2,1,0,3,2,1,0] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -2714,8 +2708,7 @@ ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_76547654: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -3140,7 +3133,7 @@ ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_44444444: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -3186,7 +3179,7 @@ ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_44444444_bc: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [4,4,4,4,4,4,4,4] +; AVX2-FAST-ALL-NEXT: vbroadcastss {{.*#+}} ymm1 = [5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45,5.60519386E-45] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -454,7 +454,7 @@ ; ; AVX512F-LABEL: test_mm256_mask_blend_epi8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: ret{{[l|q]}} entry: @@ -472,7 +472,7 @@ ; ; AVX512F-LABEL: test_mm_mask_blend_epi8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: ret{{[l|q]}} entry: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2470,8 +2470,7 @@ ; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1: ; 
AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] -; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; @@ -3471,37 +3470,21 @@ ; SSE41-NEXT: movaps %xmm2, (%rax) ; SSE41-NEXT: retq ; -; AVX1-LABEL: SpinningCube: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX1-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovaps %xmm2, (%rax) -; AVX1-NEXT: vbroadcastss (%rax), %xmm2 -; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovaps %xmm0, (%rax) -; AVX1-NEXT: retq -; -; AVX2-LABEL: SpinningCube: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX2-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; AVX2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX2-NEXT: vaddps %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vmovaps %xmm2, (%rax) -; AVX2-NEXT: vbroadcastss (%rax), %xmm2 -; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovaps %xmm0, (%rax) -; AVX2-NEXT: retq +; AVX-LABEL: SpinningCube: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 +; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> +; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] +; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; AVX-NEXT: vmovaps %xmm2, (%rax) +; AVX-NEXT: vbroadcastss (%rax), %xmm2 +; AVX-NEXT: vmulps %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rax) +; AVX-NEXT: retq entry: store float 1.000000e+00, ptr undef, align 4 %0 = load float, ptr undef, align 4 diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -528,7 +528,7 @@ ; ; AVX1-LABEL: trunc_add_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -628,7 +628,7 @@ ; ; AVX1-LABEL: trunc_add_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw 
%xmm5, %xmm3, %xmm3 @@ -696,7 +696,7 @@ ; ; AVX1-LABEL: trunc_add_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1276,7 +1276,7 @@ ; ; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1376,7 +1376,7 @@ ; ; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -1444,7 +1444,7 @@ ; ; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1686,7 +1686,7 @@ ; ; AVX1-LABEL: trunc_mul_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -2202,7 +2202,7 @@ ; ; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -2617,7 +2617,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -2730,7 +2730,7 @@ ; AVX1-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] ; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 ; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 @@ -2806,7 +2806,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 
; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -2973,7 +2973,7 @@
;
; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3073,7 +3073,7 @@
;
; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -3141,7 +3141,7 @@
;
; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3314,7 +3314,7 @@
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3427,7 +3427,7 @@
; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -3503,7 +3503,7 @@
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3670,7 +3670,7 @@
;
; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -3770,7 +3770,7 @@
;
; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -3838,7 +3838,7 @@
;
; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -4011,7 +4011,7 @@
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -4124,7 +4124,7 @@
; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -4200,7 +4200,7 @@
; AVX1: # %bb.0:
; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -4367,7 +4367,7 @@
;
; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
@@ -4467,7 +4467,7 @@
;
; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321]
; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
@@ -4535,7 +4535,7 @@
;
; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43]
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
---
a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -107,16 +107,27 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -257,17 +268,29 @@ ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovq %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovq %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4294967295,4294967295] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovq %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -1115,7 +1138,7 @@ ; ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1127,7 +1150,7 @@ ; ; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; 
AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1293,7 +1316,7 @@ ; ; AVX2-SLOW-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -1306,7 +1329,7 @@ ; ; AVX2-FAST-LABEL: trunc_packus_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [65535,65535] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 @@ -2801,16 +2824,27 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2955,17 +2989,29 @@ ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_packus_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_packus_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_packus_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [255,255] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, 
%xmm1 +; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -5125,24 +5171,14 @@ ; SSE41-NEXT: packuswb %xmm0, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_packus_v4i32_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v4i32_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: trunc_packus_v4i32_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v4i32_v4i8: ; AVX512F: # %bb.0: @@ -5233,26 +5269,15 @@ ; SSE41-NEXT: movd %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_packus_v4i32_v4i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v4i32_v4i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: trunc_packus_v4i32_v4i8_store: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, (%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_packus_v4i32_v4i8_store: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -111,16 +111,27 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, 
%xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -255,17 +266,29 @@ ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovlpd %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -1125,10 +1148,10 @@ ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1137,10 +1160,10 @@ ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = 
[18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] @@ -1295,10 +1318,10 @@ ; ; AVX2-SLOW-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -1308,10 +1331,10 @@ ; ; AVX2-FAST-LABEL: trunc_ssat_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [32767,32767] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [32767,32767] ; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] @@ -2538,16 +2561,27 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2685,17 +2719,29 @@ ; SSE41-NEXT: pextrw $0, %xmm1, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_ssat_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, 
%xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_ssat_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [127,127] +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_ssat_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [127,127] +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -4890,23 +4936,13 @@ ; SSE41-NEXT: packsswb %xmm0, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_ssat_v4i32_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v4i32_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: trunc_ssat_v4i32_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v4i32_v4i8: ; AVX512F: # %bb.0: @@ -4992,25 +5028,14 @@ ; SSE41-NEXT: movd %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_ssat_v4i32_v4i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v4i32_v4i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: trunc_ssat_v4i32_v4i8_store: +; AVX: # %bb.0: +; AVX-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, 
(%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v4i32_v4i8_store: ; AVX512F: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -67,15 +67,27 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i32: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i32: ; AVX512F: # %bb.0: @@ -164,16 +176,29 @@ ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i32_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4294967295,4294967295] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovlpd %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i32_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vmovlpd %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i32_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vmovlpd %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i32_store: ; AVX512F: # %bb.0: @@ -307,7 +332,8 @@ ; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,429496729] ; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX1-NEXT: # xmm3 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX1-NEXT: vzeroupper @@ -315,8 +341,7 @@ ; ; AVX2-SLOW-LABEL: trunc_usat_v4i64_v4i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-SLOW-NEXT: vpxor %ymm1, %ymm0, %ymm1 +; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] @@ -328,8 +353,7 @@ ; ; AVX2-FAST-ALL-LABEL: trunc_usat_v4i64_v4i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-ALL-NEXT: vpxor %ymm1, %ymm0, %ymm1 +; AVX2-FAST-ALL-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] @@ -342,8 +366,7 @@ ; ; AVX2-FAST-PERLANE-LABEL: trunc_usat_v4i64_v4i32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-PERLANE-NEXT: vpxor %ymm1, %ymm0, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] @@ -582,7 +605,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [4294967295,4294967295] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [2.1219957904712067E-314,2.1219957904712067E-314] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm7, %xmm2 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -602,7 +626,7 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] @@ -620,7 +644,7 @@ ; AVX2-FAST-ALL: # %bb.0: ; 
AVX2-FAST-ALL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-ALL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-FAST-ALL-NEXT: vpxor %ymm3, %ymm1, %ymm4 ; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] @@ -639,7 +663,7 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] +; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314,2.1219957904712067E-314] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-FAST-PERLANE-NEXT: vpxor %ymm3, %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] @@ -728,7 +752,8 @@ ; ; AVX1-LABEL: trunc_usat_v2i64_v2i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -739,9 +764,10 @@ ; ; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] ; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -750,9 +776,10 @@ ; ; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX2-FAST-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] @@ -848,7 +875,8 @@ ; ; AVX1-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX1-NEXT: # xmm1 = mem[0,0] ; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm3 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -860,9 +888,10 @@ ; ; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX2-SLOW-NEXT: # xmm1 = mem[0,0] ; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] @@ -872,9 +901,10 @@ ; ; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovapd {{.*#+}} xmm1 = [65535,65535] +; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX2-FAST-NEXT: # xmm1 = mem[0,0] ; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-FAST-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1021,7 +1051,8 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 @@ -1033,9 +1064,8 @@ ; ; AVX2-LABEL: trunc_usat_v4i64_v4i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [65535,65535,65535,65535] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 @@ -1187,7 +1217,8 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vmovapd {{.*#+}} xmm5 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX1-NEXT: # xmm5 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 @@ -1200,9 +1231,8 @@ ; ; AVX2-LABEL: trunc_usat_v4i64_v4i16_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [65535,65535,65535,65535] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = 
[3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 @@ -1447,7 +1477,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [65535,65535] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [3.2378592100206092E-319,3.2378592100206092E-319] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -1467,7 +1498,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] @@ -1536,18 +1567,11 @@ ; SSE41-NEXT: packusdw %xmm0, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_usat_v4i32_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v4i32_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: trunc_usat_v4i32_v4i16: +; AVX: # %bb.0: +; AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v4i32_v4i16: ; AVX512F: # %bb.0: @@ -1621,20 +1645,12 @@ ; SSE41-NEXT: movq %xmm0, (%rdi) ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_usat_v4i32_v4i16_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v4i32_v4i16_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: retq +; AVX-LABEL: trunc_usat_v4i32_v4i16_store: +; AVX: # %bb.0: +; AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovq %xmm0, (%rdi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v4i32_v4i16_store: ; AVX512F: # %bb.0: @@ -1741,8 +1757,7 @@ ; ; AVX2-LABEL: trunc_usat_v8i32_v8i16: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -1991,15 +2006,27 @@ ; SSE41-NEXT: movdqa %xmm2, %xmm0 ; SSE41-NEXT: 
retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i8: ; AVX512F: # %bb.0: @@ -2091,16 +2118,29 @@ ; SSE41-NEXT: pextrw $0, %xmm2, (%rdi) ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc_usat_v2i64_v2i8_store: -; AVX: # %bb.0: -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [255,255] -; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: trunc_usat_v2i64_v2i8_store: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm1 = mem[0,0] +; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc_usat_v2i64_v2i8_store: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovddup {{.*#+}} xmm1 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX2-NEXT: # xmm1 = mem[0,0] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] +; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpextrw $0, %xmm0, (%rdi) +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i8_store: ; AVX512F: # %bb.0: @@ -2241,7 +2281,8 @@ ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = 
[1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -2256,9 +2297,8 @@ ; ; AVX2-LABEL: trunc_usat_v4i64_v4i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [255,255,255,255] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 @@ -2411,7 +2451,8 @@ ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm4 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm4, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm1 @@ -2427,9 +2468,8 @@ ; ; AVX2-LABEL: trunc_usat_v4i64_v4i8_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [255,255,255,255] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] +; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 @@ -2663,7 +2703,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -2684,7 +2725,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] @@ -2904,7 +2945,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = 
[1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -2926,7 +2968,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] @@ -3295,7 +3337,8 @@ ; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm7 = [255,255] +; AVX1-NEXT: vmovddup {{.*#+}} xmm7 = [1.2598673968951787E-321,1.2598673968951787E-321] +; AVX1-NEXT: # xmm7 = mem[0,0] ; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 @@ -3337,7 +3380,7 @@ ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] @@ -3423,20 +3466,12 @@ ; SSE41-NEXT: packuswb %xmm0, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: trunc_usat_v4i32_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v4i32_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: trunc_usat_v4i32_v4i8: +; AVX: # %bb.0: +; AVX-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v4i32_v4i8: ; AVX512F: # %bb.0: @@ -3519,8 +3554,7 @@ ; ; AVX2-SLOW-LABEL: trunc_usat_v4i32_v4i8_store: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX2-SLOW-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi) @@ -3528,8 +3562,7 @@ ; ; AVX2-FAST-LABEL: trunc_usat_v4i32_v4i8_store: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255] -; AVX2-FAST-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi) ; AVX2-FAST-NEXT: retq @@ -3636,8 +3669,7 @@ ; ; AVX2-LABEL: trunc_usat_v8i32_v8i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 @@ -3750,8 +3782,7 @@ ; ; AVX2-LABEL: trunc_usat_v8i32_v8i8_store: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 @@ -4387,7 +4418,7 @@ ; ; AVX2-LABEL: trunc_usat_v32i16_v32i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX2-NEXT: vpminuw (%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 @@ -4396,7 +4427,7 @@ ; ; AVX512F-LABEL: trunc_usat_v32i16_v32i8: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} ymm0 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpminuw 32(%rdi), %ymm0, %ymm1 ; AVX512F-NEXT: vpminuw (%rdi), %ymm0, %ymm0 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -201,7 +201,7 @@ ; ; AVX1-LABEL: trunc8i64_8i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319,3.2378592100206092E-319] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -279,7 +279,7 @@ ; ; AVX1-LABEL: trunc8i64_8i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321,1.2598673968951787E-321] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -671,7 +671,7 @@ ; ; AVX1-LABEL: trunc16i32_16i16: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 @@ -897,7 +897,7 @@ ; ; AVX1-LABEL: trunc16i32_16i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43,3.57331108E-43] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 @@ -1295,7 +1295,7 @@ ; ; AVX1-LABEL: trunc32i16_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 @@ -1733,7 +1733,7 @@ ; ; AVX1-LABEL: trunc2x16i16_32i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38,2.34184089E-38] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 @@ -1815,17 +1815,25 @@ ; SSE41-NEXT: packuswb %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: trunc2x8i16_16i8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: trunc2x8i16_16i8: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: trunc2x8i16_16i8: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq ; ; AVX512F-LABEL: trunc2x8i16_16i8: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX512F-NEXT: vpbroadcastw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 @@ -2146,7 +2154,7 @@ ; ; AVX1-LABEL: store_merge_split: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] +; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41,9.18340949E-41] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -122,7 +122,7 @@ ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; 
AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -150,7 +150,7 @@ ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -323,7 +323,7 @@ ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -351,7 +351,7 @@ ; AVX512CD-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64] +; AVX512CD-NEXT: vpbroadcastq {{.*#+}} xmm1 = [64,64] ; AVX512CD-NEXT: vpsubq %xmm0, %xmm1, %xmm0 ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq @@ -548,7 +548,7 @@ ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -789,7 +789,7 @@ ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 @@ -992,23 +992,77 @@ ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv8i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, 
%xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv8i16: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv8i16: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i16: ; AVX512VPOPCNTDQ: # %bb.0: @@ -1168,23 +1222,77 @@ ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv8i16u: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv8i16u: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; 
AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv8i16u: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX2-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv8i16u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CDVL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv8i16u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddw %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512CD-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv8i16u: ; AVX512VPOPCNTDQ: # %bb.0: @@ -1330,20 +1438,65 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd 
%xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv16i8: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv16i8: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv16i8: ; AVX512VPOPCNTDQ: # %bb.0: @@ -1485,20 +1638,65 @@ ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv16i8u: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv16i8u: +; AVX1: # %bb.0: +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; 
AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv16i8u: +; AVX2: # %bb.0: +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512CDVL-LABEL: testv16i8u: +; AVX512CDVL: # %bb.0: +; AVX512CDVL-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CDVL-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CDVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CDVL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CDVL-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CDVL-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CDVL-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CDVL-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CDVL-NEXT: retq +; +; AVX512CD-LABEL: testv16i8u: +; AVX512CD: # %bb.0: +; AVX512CD-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX512CD-NEXT: vpaddb %xmm1, %xmm0, %xmm1 +; AVX512CD-NEXT: vpandn %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512CD-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512CD-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512CD-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512CD-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512CD-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512CD-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512CD-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: testv16i8u: ; AVX512VPOPCNTDQ: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-256.ll @@ -45,7 +45,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -119,7 +119,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -168,7 +168,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -242,7 +242,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddq %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -299,7 +299,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -385,7 +385,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -446,7 +446,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -532,7 +532,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = 
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -588,7 +588,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -624,7 +624,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -679,7 +679,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -732,7 +732,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -768,7 +768,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -823,7 +823,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddw %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; 
X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -870,7 +870,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -900,7 +900,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -915,7 +915,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -962,7 +962,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1006,7 +1006,7 @@ ; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1036,7 +1036,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512CD-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1051,7 +1051,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 @@ -1098,7 +1098,7 @@ ; X32-AVX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; X32-AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm1 ; X32-AVX-NEXT: vpandn %ymm1, %ymm0, %ymm0 -; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX-NEXT: vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; X32-AVX-NEXT: vpand %ymm1, %ymm0, %ymm2 ; X32-AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; X32-AVX-NEXT: vpshufb %ymm2, %ymm3, %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll --- a/llvm/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-512.ll @@ -263,7 +263,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -359,7 +359,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 ; AVX512CD-NEXT: vpaddw %ymm1, %ymm0, %ymm2 ; AVX512CD-NEXT: vpandn %ymm2, %ymm0, %ymm2 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm2, %ymm4 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -456,7 +456,7 @@ ; 
AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -511,7 +511,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -548,7 +548,7 @@ ; AVX512CD-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512CD-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; AVX512CD-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512CD-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512CD-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512CD-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512CD-NEXT: vpshufb %ymm4, %ymm5, %ymm4 @@ -603,7 +603,7 @@ ; AVX512VPOPCNTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm3 ; AVX512VPOPCNTDQ-NEXT: vpandn %ymm3, %ymm1, %ymm1 -; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm1, %ymm4 ; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm4, %ymm5, %ymm4 diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll --- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -353,15 +353,25 @@ ; SSE-NEXT: pcmpgtb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ugt_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ugt_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, 
%xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ugt_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %sh1 = lshr <16 x i8> %x, %sh2 = lshr <16 x i8> %y, %cmp = icmp ugt <16 x i8> %sh1, %sh2 @@ -380,15 +390,25 @@ ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ult_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ult_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ult_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: retq %sh1 = lshr <16 x i8> %x, %sh2 = lshr <16 x i8> %y, %cmp = icmp ult <16 x i8> %sh1, %sh2 @@ -407,16 +427,27 @@ ; SSE-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: uge_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: uge_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: uge_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %sh1 = lshr <16 x i8> %x, %sh2 = lshr <16 x i8> %y, %cmp = icmp uge <16 x i8> %sh1, %sh2 @@ -435,16 +466,27 @@ ; SSE-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: ule_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: ule_v16i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm1 +; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: ule_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $1, %xmm1, %xmm1 +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq %sh1 = lshr <16 x i8> %x, %sh2 = lshr <16 x i8> %y, %cmp = icmp ule <16 x i8> %sh1, %sh2 diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2416,8 +2416,7 @@ ; AVX2-NEXT: shrq $34, %rcx ; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 ; AVX2-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: zext_4i17_to_4i32: diff --git a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll --- a/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll +++ b/llvm/test/CodeGen/X86/vector_splat-const-shift-of-constmasked.ll @@ -447,14 +447,23 @@ ; X86-SSE2-NEXT: psubb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_1: ; X64-SSE2: # %bb.0: @@ -465,14 +474,23 @@ ; X64-SSE2-NEXT: psubb %xmm1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_1: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpsrlw $1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] -; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 
-; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_1: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_1: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrlw $1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <16 x i8> %a0, %t1 = ashr <16 x i8> %t0, ret <16 x i8> %t1 @@ -487,14 +505,23 @@ ; X86-SSE2-NEXT: psubb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_4: ; X64-SSE2: # %bb.0: @@ -505,14 +532,23 @@ ; X64-SSE2-NEXT: psubb %xmm1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_4: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_4: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_4: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <16 x i8> %a0, %t1 = ashr <16 x i8> %t0, ret <16 x i8> %t1 @@ -527,14 +563,23 @@ ; X86-SSE2-NEXT: psubb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; 
X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_5: ; X64-SSE2: # %bb.0: @@ -545,14 +590,23 @@ ; X64-SSE2-NEXT: psubb %xmm1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_5: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpsrlw $5, %xmm0, %xmm0 -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] -; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_5: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_5: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrlw $5, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <16 x i8> %a0, %t1 = ashr <16 x i8> %t0, ret <16 x i8> %t1 @@ -567,14 +621,23 @@ ; X86-SSE2-NEXT: psubb %xmm1, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6: -; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0 -; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; X86-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X86-AVX-NEXT: retl +; X86-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6: +; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X86-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: retl +; +; X86-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6: +; X86-AVX2: # %bb.0: +; X86-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X86-AVX2-NEXT: vpxor %xmm1, 
%xmm0, %xmm0 +; X86-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i8_x_16_224_mask_ashr_6: ; X64-SSE2: # %bb.0: @@ -585,14 +648,23 @@ ; X64-SSE2-NEXT: psubb %xmm1, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX-LABEL: test_128_i8_x_16_224_mask_ashr_6: -; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpsrlw $6, %xmm0, %xmm0 -; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] -; X64-AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; X64-AVX-NEXT: retq +; X64-AVX1-LABEL: test_128_i8_x_16_224_mask_ashr_6: +; X64-AVX1: # %bb.0: +; X64-AVX1-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: retq +; +; X64-AVX2-LABEL: test_128_i8_x_16_224_mask_ashr_6: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: vpsrlw $6, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2] +; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: retq %t0 = and <16 x i8> %a0, %t1 = ashr <16 x i8> %t0, ret <16 x i8> %t1 @@ -1623,18 +1695,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_32767_mask_lshr_1: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_32767_mask_lshr_1: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16383,16383,16383,16383] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_32767_mask_lshr_1: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_32767_mask_lshr_1: ; X64-SSE2: # %bb.0: @@ -1642,18 +1707,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_32767_mask_lshr_1: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_32767_mask_lshr_1: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16383,16383,16383,16383] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_32767_mask_lshr_1: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = lshr <4 x i32> %t0, ret <4 x i32> %t1 @@ -1666,18 +1724,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_lshr_7: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $7, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_lshr_7: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $7, 
%xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_lshr_7: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $7, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_lshr_7: ; X64-SSE2: # %bb.0: @@ -1685,18 +1736,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_lshr_7: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $7, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_lshr_7: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $7, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_lshr_7: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $7, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = lshr <4 x i32> %t0, ret <4 x i32> %t1 @@ -1708,18 +1752,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_lshr_8: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_lshr_8: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $8, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_lshr_8: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $8, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_lshr_8: ; X64-SSE2: # %bb.0: @@ -1727,18 +1764,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_lshr_8: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_lshr_8: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $8, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_lshr_8: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $8, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = lshr <4 x i32> %t0, ret <4 x i32> %t1 @@ -1750,18 +1780,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_lshr_9: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $9, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_lshr_9: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $9, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16383,16383,16383,16383] -; X86-AVX2-NEXT: vpand %xmm1, 
%xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_lshr_9: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $9, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_lshr_9: ; X64-SSE2: # %bb.0: @@ -1769,18 +1792,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_lshr_9: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $9, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_lshr_9: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $9, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16383,16383,16383,16383] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_lshr_9: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $9, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = lshr <4 x i32> %t0, ret <4 x i32> %t1 @@ -1792,18 +1808,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_lshr_10: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $10, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_lshr_10: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $10, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8191,8191,8191,8191] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_lshr_10: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $10, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_lshr_10: ; X64-SSE2: # %bb.0: @@ -1811,18 +1820,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_lshr_10: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $10, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_lshr_10: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $10, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8191,8191,8191,8191] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_lshr_10: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $10, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = lshr <4 x i32> %t0, ret <4 x i32> %t1 @@ -1835,18 +1837,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_lshr_1: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_lshr_1: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147418112,2147418112,2147418112,2147418112] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: 
test_128_i32_x_4_4294836224_mask_lshr_1: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_lshr_1: ; X64-SSE2: # %bb.0: @@ -1854,18 +1849,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_lshr_1: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_lshr_1: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147418112,2147418112,2147418112,2147418112] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_4294836224_mask_lshr_1: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = lshr <4 x i32> %t0, ret <4 x i32> %t1 @@ -1877,18 +1865,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_lshr_16: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_lshr_16: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_4294836224_mask_lshr_16: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_lshr_16: ; X64-SSE2: # %bb.0: @@ -1896,18 +1877,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_lshr_16: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_lshr_16: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_4294836224_mask_lshr_16: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = lshr <4 x i32> %t0, ret <4 x i32> %t1 @@ -1970,18 +1944,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_32767_mask_ashr_1: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_32767_mask_ashr_1: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16383,16383,16383,16383] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_32767_mask_ashr_1: +; X86-AVX: # %bb.0: 
+; X86-AVX-NEXT: vpsrld $1, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_32767_mask_ashr_1: ; X64-SSE2: # %bb.0: @@ -1989,18 +1956,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_32767_mask_ashr_1: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_32767_mask_ashr_1: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16383,16383,16383,16383] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_32767_mask_ashr_1: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $1, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = ashr <4 x i32> %t0, ret <4 x i32> %t1 @@ -2013,18 +1973,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_ashr_7: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $7, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_ashr_7: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $7, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_ashr_7: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $7, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_ashr_7: ; X64-SSE2: # %bb.0: @@ -2032,18 +1985,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_ashr_7: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $7, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_ashr_7: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $7, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65534,65534,65534,65534] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_ashr_7: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $7, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = ashr <4 x i32> %t0, ret <4 x i32> %t1 @@ -2055,18 +2001,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_ashr_8: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_ashr_8: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $8, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_ashr_8: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $8, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; 
X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_ashr_8: ; X64-SSE2: # %bb.0: @@ -2074,18 +2013,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_ashr_8: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_ashr_8: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $8, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_ashr_8: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $8, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = ashr <4 x i32> %t0, ret <4 x i32> %t1 @@ -2097,18 +2029,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_ashr_9: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $9, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_ashr_9: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $9, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16383,16383,16383,16383] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_ashr_9: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $9, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_ashr_9: ; X64-SSE2: # %bb.0: @@ -2116,18 +2041,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_ashr_9: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $9, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_ashr_9: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $9, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [16383,16383,16383,16383] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_ashr_9: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $9, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = ashr <4 x i32> %t0, ret <4 x i32> %t1 @@ -2139,18 +2057,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_ashr_10: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrld $10, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_ashr_10: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrld $10, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8191,8191,8191,8191] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_ashr_10: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrld $10, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_ashr_10: ; X64-SSE2: # %bb.0: @@ -2158,18 +2069,11 @@ ; X64-SSE2-NEXT: 
pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_ashr_10: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrld $10, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_ashr_10: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrld $10, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8191,8191,8191,8191] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_ashr_10: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrld $10, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = ashr <4 x i32> %t0, ret <4 x i32> %t1 @@ -2212,18 +2116,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_ashr_16: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_ashr_16: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967294,4294967294,4294967294,4294967294] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_4294836224_mask_ashr_16: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpsrad $16, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_ashr_16: ; X64-SSE2: # %bb.0: @@ -2231,18 +2128,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_ashr_16: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_ashr_16: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpsrad $16, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967294,4294967294,4294967294,4294967294] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_4294836224_mask_ashr_16: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpsrad $16, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = ashr <4 x i32> %t0, ret <4 x i32> %t1 @@ -2305,18 +2195,11 @@ ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_1: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_32767_mask_shl_1: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_1: ; X64-SSE2: # %bb.0: @@ -2324,18 +2207,11 @@ ; X64-SSE2-NEXT: paddd %xmm0, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: 
test_128_i32_x_4_32767_mask_shl_1: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_1: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_32767_mask_shl_1: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, ret <4 x i32> %t1 @@ -2347,18 +2223,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_16: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_16: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpslld $16, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147418112,2147418112,2147418112,2147418112] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_32767_mask_shl_16: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpslld $16, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_32767_mask_shl_16: ; X64-SSE2: # %bb.0: @@ -2366,18 +2235,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_32767_mask_shl_16: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpslld $16, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_32767_mask_shl_16: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpslld $16, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147418112,2147418112,2147418112,2147418112] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_32767_mask_shl_16: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpslld $16, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, ret <4 x i32> %t1 @@ -2438,18 +2300,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_shl_7: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpslld $7, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_shl_7: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpslld $7, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073709056,1073709056,1073709056,1073709056] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_shl_7: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpslld $7, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_shl_7: ; X64-SSE2: # %bb.0: @@ -2457,18 +2312,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_shl_7: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: 
vpslld $7, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_shl_7: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpslld $7, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1073709056,1073709056,1073709056,1073709056] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_shl_7: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpslld $7, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, ret <4 x i32> %t1 @@ -2480,18 +2328,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_shl_8: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpslld $8, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_shl_8: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpslld $8, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147418112,2147418112,2147418112,2147418112] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_shl_8: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpslld $8, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_shl_8: ; X64-SSE2: # %bb.0: @@ -2499,18 +2340,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_shl_8: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpslld $8, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_shl_8: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpslld $8, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2147418112,2147418112,2147418112,2147418112] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_shl_8: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpslld $8, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, ret <4 x i32> %t1 @@ -2522,18 +2356,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_shl_9: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpslld $9, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_shl_9: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpslld $9, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_shl_9: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpslld $9, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_shl_9: ; X64-SSE2: # %bb.0: @@ -2541,18 +2368,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_shl_9: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpslld $9, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_shl_9: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpslld $9, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_shl_9: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpslld $9, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, ret <4 x i32> %t1 @@ -2564,18 +2384,11 @@ ; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_8388352_mask_shl_10: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpslld $10, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_8388352_mask_shl_10: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpslld $10, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_8388352_mask_shl_10: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpslld $10, %xmm0, %xmm0 +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_8388352_mask_shl_10: ; X64-SSE2: # %bb.0: @@ -2583,18 +2396,11 @@ ; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_8388352_mask_shl_10: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpslld $10, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_8388352_mask_shl_10: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpslld $10, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294705152,4294705152,4294705152,4294705152] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_8388352_mask_shl_10: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpslld $10, %xmm0, %xmm0 +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, ret <4 x i32> %t1 @@ -2607,18 +2413,11 @@ ; X86-SSE2-NEXT: paddd %xmm0, %xmm0 ; X86-SSE2-NEXT: retl ; -; X86-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X86-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X86-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; X86-AVX2-NEXT: retl +; X86-AVX-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: +; X86-AVX: # %bb.0: +; X86-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X86-AVX-NEXT: retl ; ; X64-SSE2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: ; X64-SSE2: # %bb.0: @@ -2626,18 +2425,11 @@ ; X64-SSE2-NEXT: paddd %xmm0, %xmm0 ; X64-SSE2-NEXT: retq ; -; X64-AVX1-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; X64-AVX1-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; 
X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294836224,4294836224,4294836224,4294836224] -; X64-AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; X64-AVX2-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; X64-AVX2-NEXT: retq +; X64-AVX-LABEL: test_128_i32_x_4_4294836224_mask_shl_1: +; X64-AVX: # %bb.0: +; X64-AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; X64-AVX-NEXT: retq %t0 = and <4 x i32> %a0, %t1 = shl <4 x i32> %t0, ret <4 x i32> %t1 diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -47,7 +47,7 @@ ; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: movq (%rdi,%rsi,8), %rax -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] +; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] ; AVX1-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX1-NEXT: vmovupd %ymm0, (%rax) ; AVX1-NEXT: vzeroupper @@ -58,9 +58,8 @@ ; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 ; AVX2-NEXT: movq (%rdi,%rsi,8), %rax -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1] -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] -; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm2, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [5.0E-1,5.0E-1,5.0E-1,5.0E-1] +; AVX2-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX2-NEXT: vmovupd %ymm0, (%rax) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -110,12 +109,9 @@ ; ; AVX2-LABEL: test3: ; AVX2: ## %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531] -; AVX2-NEXT: vpmulld %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [715827882,715827882,715827882,715827882] -; AVX2-NEXT: vpaddd %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1431655764,1431655764,1431655764,1431655764] -; AVX2-NEXT: vpminud %xmm3, %xmm0, %xmm3 +; AVX2-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3 ; AVX2-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/vselect-minmax.ll b/llvm/test/CodeGen/X86/vselect-minmax.ll --- a/llvm/test/CodeGen/X86/vselect-minmax.ll +++ b/llvm/test/CodeGen/X86/vselect-minmax.ll @@ -9521,7 +9521,7 @@ ; ; AVX2-LABEL: test181: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -9589,7 +9589,7 @@ ; ; AVX2-LABEL: test182: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -9657,7 +9657,7 @@ ; ; AVX2-LABEL: test183: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -9725,7 +9725,7 @@ ; ; AVX2-LABEL: test184: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10027,7 +10027,7 @@ ; ; AVX2-LABEL: test189: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10095,7 +10095,7 @@ ; ; AVX2-LABEL: test190: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10163,7 +10163,7 @@ ; ; AVX2-LABEL: test191: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -10231,7 +10231,7 @@ ; ; AVX2-LABEL: test192: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vselect-pcmp.ll b/llvm/test/CodeGen/X86/vselect-pcmp.ll --- a/llvm/test/CodeGen/X86/vselect-pcmp.ll +++ b/llvm/test/CodeGen/X86/vselect-pcmp.ll @@ -681,7 +681,7 @@ ; AVX512F-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] +; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] ; AVX512F-NEXT: vptestnmq %zmm3, %zmm0, %k1 ; AVX512F-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1} ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll --- a/llvm/test/CodeGen/X86/vselect-post-combine.ll +++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll @@ -5,7 +5,7 @@ ; AVX2-LABEL: test_mul: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpblendvb %xmm0, (%rdi), %xmm1, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; AVX2-NEXT: vmovdqu %ymm0, 0 
diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll --- a/llvm/test/CodeGen/X86/vselect-zero.ll +++ b/llvm/test/CodeGen/X86/vselect-zero.ll @@ -125,7 +125,8 @@ ; AVX-LABEL: fsel_nonzero_false_val: ; AVX: # %bb.0: ; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -179,7 +180,8 @@ ; AVX-LABEL: fsel_nonzero_constants: ; AVX: # %bb.0: ; AVX-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [4.2E+1,4.2E+1] +; AVX-NEXT: # xmm1 = mem[0,0] ; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/win_cst_pool.ll b/llvm/test/CodeGen/X86/win_cst_pool.ll --- a/llvm/test/CodeGen/X86/win_cst_pool.ll +++ b/llvm/test/CodeGen/X86/win_cst_pool.ll @@ -65,16 +65,14 @@ define <4 x float> @undef1() { ret <4 x float> -; CHECK: .globl __xmm@00000000000000003f8000003f800000 -; CHECK-NEXT: .section .rdata,"dr",discard,__xmm@00000000000000003f8000003f800000 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: __xmm@00000000000000003f8000003f800000: -; CHECK-NEXT: .long 0x3f800000 # float 1 +; CHECK: .globl __real@3f800000 +; CHECK-NEXT: .section .rdata,"dr",discard,__real@3f800000 +; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: __real@3f800000: ; CHECK-NEXT: .long 0x3f800000 # float 1 -; CHECK-NEXT: .zero 4 -; CHECK-NEXT: .zero 4 +; CHECK-NEXT: .text ; CHECK: undef1: -; CHECK: movaps __xmm@00000000000000003f8000003f800000(%rip), %xmm0 +; CHECK: vbroadcastss __real@3f800000(%rip), %xmm0 ; CHECK-NEXT: ret } diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -656,51 +656,52 @@ ; AVX2-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm8, %ymm1, %ymm9 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [17179869184,17179869184,17179869184,17179869184] +; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm6[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm9 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm8 -; AVX2-NEXT: vpermd %ymm8, %ymm6, %ymm8 -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = 
<1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] +; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm10 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-NEXT: vpermd %ymm10, %ymm7, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5],ymm9[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = -; AVX2-NEXT: vpshufb %xmm8, %xmm5, %xmm9 -; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm8 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm9, %xmm3, %xmm10 -; AVX2-NEXT: vpshufb %xmm9, %xmm2, %xmm9 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm10 +; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm9 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] -; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %ymm9, %ymm1, %ymm10 -; AVX2-NEXT: vpermd %ymm10, %ymm6, %ymm10 -; AVX2-NEXT: vpshufb %ymm9, %ymm0, %ymm9 -; AVX2-NEXT: vpermd %ymm9, %ymm6, %ymm9 -; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] -; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = -; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm5 -; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm4 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm6, %ymm11 +; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-NEXT: vpermd %ymm10, %ymm7, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vpcmpeqb %ymm9, %ymm8, %ymm8 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vpshufb %xmm9, %xmm5, %xmm10 +; AVX2-NEXT: vpshufb %xmm9, %xmm4, %xmm9 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %xmm10, %xmm3, %xmm11 +; AVX2-NEXT: vpshufb %xmm10, %xmm2, %xmm10 +; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] +; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-NEXT: vpshufb %ymm10, %ymm1, %ymm11 +; AVX2-NEXT: vpermd %ymm11, %ymm6, %ymm11 +; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm10 +; AVX2-NEXT: vpermd %ymm10, %ymm7, %ymm10 +; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] +; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-NEXT: vpshufb %xmm10, %xmm5, %xmm5 +; AVX2-NEXT: vpshufb %xmm10, %xmm4, 
%xmm4 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u> ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 @@ -711,11 +712,11 @@ ; AVX2-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpermd %ymm1, %ymm6, %ymm1 ; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermd %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpcmpeqb %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpxor %ymm0, %ymm7, %ymm0 +; AVX2-NEXT: vpcmpeqb %ymm0, %ymm9, %ymm0 +; AVX2-NEXT: vpxor %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -805,8 +806,7 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm2 ; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX1-NEXT: # ymm5 = mem[0,1,0,1] +; AVX1-NEXT: vmovaps {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX1-NEXT: vandnps %ymm2, %ymm5, %ymm2 ; AVX1-NEXT: vandps %ymm5, %ymm8, %ymm5 ; AVX1-NEXT: vorps %ymm2, %ymm5, %ymm2 @@ -1179,15 +1179,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20] ; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10 ; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26] -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0] -; AVX2-NEXT: # ymm11 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = ; AVX2-NEXT: vpblendvb %ymm11, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,ymm1[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm1[16,17,18,19,20,21,22,23,24,25,26] ; AVX2-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm1 @@ -1396,15 +1394,13 @@ ; AVX2-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3 ; AVX2-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm1 ; AVX2-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm4 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = 
[255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] -; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255,255] ; AVX2-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm7 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14] ; AVX2-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm5 ; AVX2-NEXT: vpshufb %ymm8, %ymm5, %ymm5 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0,255] -; AVX2-NEXT: # ymm8 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = <255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255,255,0,u,255,0,u,255,0,u,255,0,u,255,0,u,255> ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm3 ; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm6 ; AVX2-NEXT: vpblendvb %ymm8, %ymm2, %ymm0, %ymm9 @@ -1427,8 +1423,7 @@ ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = ; AVX2-NEXT: vpshufb %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] -; AVX2-NEXT: # ymm7 = mem[0,1,0,1] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20] diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -5165,8 +5165,7 @@ ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,0,0,1,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,u,u,1,u> ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm1 @@ -6043,8 +6042,7 @@ ; AVX2-SLOW-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpsrld $16, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero @@ -6063,8 +6061,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] @@ -6082,8 +6079,7 @@ ; AVX2-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -1221,7 +1221,7 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastd %xmm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1346,7 +1346,7 @@ ; AVX2-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -2429,7 +2429,7 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero,ymm2[16],zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -2723,7 +2723,7 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm2[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero,ymm2[16],zero,zero,zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 @@ -3018,7 +3018,7 @@ ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1012,7 +1012,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1110,7 +1110,7 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] ; AVX2-NEXT: vpblendvb %ymm1, 32(%rdi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) @@ -1909,7 +1909,7 @@ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,0,u,0,u,0,u,0,u,0,u,0,u,0,u,16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero,ymm1[16],zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; AVX2-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm3, %ymm1, 
%ymm2, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2161,7 +2161,7 @@ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,0,u,u,u,0,u,u,u,0,u,u,u,16],zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16],zero,zero,zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040,4294967040] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -2413,7 +2413,7 @@ ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,u,u,u,u,u,u,u,0,u,u,u,u,u,u,u,16],zero,zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = mem[2,3],ymm1[2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255] +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551360,18446744073709551360,18446744073709551360,18446744073709551360] ; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0