diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -46,6 +46,7 @@ X86FastISel.cpp X86FixupBWInsts.cpp X86FixupLEAs.cpp + X86FixupInstTuning.cpp X86AvoidStoreForwardingBlocks.cpp X86DynAllocaExpander.cpp X86FixupSetCC.cpp diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -63,6 +63,10 @@ /// instructions, in order to eliminate execution delays in some processors. FunctionPass *createX86FixupLEAs(); +/// Return a pass that replaces equivalent slower instructions with faster +/// ones. +FunctionPass *createX86FixupInstTuning(); + /// Return a pass that removes redundant LEA instructions and redundant address /// recalculations. FunctionPass *createX86OptimizeLEAs(); @@ -167,6 +171,7 @@ void initializeFPSPass(PassRegistry &); void initializeFixupBWInstPassPass(PassRegistry &); void initializeFixupLEAPassPass(PassRegistry &); +void initializeX86FixupInstTuningPassPass(PassRegistry &); void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); void initializeX86AvoidTrailingCallPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86FixupInstTuning.cpp b/llvm/lib/Target/X86/X86FixupInstTuning.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/X86/X86FixupInstTuning.cpp @@ -0,0 +1,141 @@ +//===-- X86FixupInstTuning.cpp - replace instructions ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file does a tuning pass replacing slower machine instructions +// with faster ones. 
We do this here, as opposed to during normal ISel, as +// attempting to get the "right" instruction can break patterns. This pass +// is not meant to search for special cases where an instruction can be transformed +// to another, it is only meant to do transformations where the old instruction +// is always replaceable with the new instructions. For example: +// +// `vpermq ymm` -> `vshufd ymm` +// -- BAD, not always valid (lane cross/non-repeated mask) +// +// `vpermq xmm` -> `vshufd xmm` +// -- GOOD, always replaceable +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-fixup-inst-tuning" + +STATISTIC(NumInstChanges, "Number of instructions changes"); + +namespace { +class X86FixupInstTuningPass : public MachineFunctionPass { +public: + static char ID; + + X86FixupInstTuningPass() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "X86 Fixup Inst Tuning"; } + + bool runOnMachineFunction(MachineFunction &MF) override; + bool processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &I); + + // This pass runs after regalloc and doesn't support VReg operands. 
+ MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + const X86InstrInfo *TII = nullptr; + const X86Subtarget *ST = nullptr; +}; +} // end anonymous namespace + +char X86FixupInstTuningPass::ID = 0; + +INITIALIZE_PASS(X86FixupInstTuningPass, DEBUG_TYPE, DEBUG_TYPE, false, false) + +FunctionPass *llvm::createX86FixupInstTuning() { + return new X86FixupInstTuningPass(); +} + +bool X86FixupInstTuningPass::processInstruction( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator &I) { + MachineInstr &MI = *I; + unsigned Opc = MI.getOpcode(); + unsigned NumOperands = MI.getDesc().getNumOperands(); + + // `vpermilps r, i` -> `vshufps r, r, i` + // `vshufps` is always as fast or faster than `vpermilps` and takes 1 less + // byte of code size. + auto ProcessVPERMILPSri = [&](unsigned NewOpc) -> bool { + unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm(); + MI.removeOperand(NumOperands - 1); + MI.addOperand(MI.getOperand(1)); + MI.setDesc(TII->get(NewOpc)); + MI.addOperand(MachineOperand::CreateImm(MaskImm)); + return true; + }; + + // `vpermilps m, i` -> `vpshufd m, i` iff no domain delay penalty on shuffles. + // `vpshufd` is always as fast or faster than `vpermilps` and takes 1 less + // byte of code size. 
+ auto ProcessVPERMILPSmi = [&](unsigned NewOpc) -> bool { + if (!ST->hasNoDomainDelayShuffle()) + return false; + MI.setDesc(TII->get(NewOpc)); + return true; + }; + + switch (Opc) { + case X86::VPERMILPSri: + return ProcessVPERMILPSri(X86::VSHUFPSrri); + case X86::VPERMILPSYri: + return ProcessVPERMILPSri(X86::VSHUFPSYrri); + case X86::VPERMILPSZ128ri: + return ProcessVPERMILPSri(X86::VSHUFPSZ128rri); + case X86::VPERMILPSZ256ri: + return ProcessVPERMILPSri(X86::VSHUFPSZ256rri); + case X86::VPERMILPSZri: + return ProcessVPERMILPSri(X86::VSHUFPSZrri); + case X86::VPERMILPSYmi: + return ProcessVPERMILPSmi(X86::VPSHUFDmi); + case X86::VPERMILPSmi: + return ProcessVPERMILPSmi(X86::VPSHUFDYmi); + case X86::VPERMILPSZ128mi: + return ProcessVPERMILPSmi(X86::VPSHUFDZ128mi); + case X86::VPERMILPSZ256mi: + return ProcessVPERMILPSmi(X86::VPSHUFDZ256mi); + case X86::VPERMILPSZmi: + return ProcessVPERMILPSmi(X86::VPSHUFDZmi); + default: + return false; + } +} + +bool X86FixupInstTuningPass::runOnMachineFunction(MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "Start X86FixupInstTuning\n";); + bool Changed = false; + ST = &MF.getSubtarget(); + TII = ST->getInstrInfo(); + for (MachineBasicBlock &MBB : MF) { + for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { + if (processInstruction(MF, MBB, I)) { + ++NumInstChanges; + Changed = true; + } + } + } + LLVM_DEBUG(dbgs() << "End X86FixupInstTuning\n";); + return Changed; +} diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -571,6 +571,7 @@ addPass(createX86FixupBWInsts()); addPass(createX86PadShortFunctions()); addPass(createX86FixupLEAs()); + addPass(createX86FixupInstTuning()); } addPass(createX86EvexToVexInsts()); addPass(createX86DiscriminateMemOpsPass()); diff --git a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll --- 
a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll +++ b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll @@ -6,7 +6,7 @@ ; AVX1-LABEL: endless_loop: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovaps (%eax), %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] diff --git a/llvm/test/CodeGen/X86/SwizzleShuff.ll b/llvm/test/CodeGen/X86/SwizzleShuff.ll --- a/llvm/test/CodeGen/X86/SwizzleShuff.ll +++ b/llvm/test/CodeGen/X86/SwizzleShuff.ll @@ -21,8 +21,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,2] -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,2] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,2] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,2,2] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,2] ; CHECK-NEXT: vxorps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %A = load <4 x i32>, ptr %pA diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1942,7 +1942,7 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 @@ -4242,7 +4242,7 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] -; AVX-NEXT: vpermilps 
{{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1562,7 +1562,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 @@ -3439,7 +3439,7 @@ ; AVX-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,1,0,1] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1275,7 +1275,7 @@ define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind { ; CHECK-LABEL: test_mm_permute_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> ret <4 x float> %res 
@@ -1284,7 +1284,7 @@ define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind { ; CHECK-LABEL: test2_mm_permute_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,2,3] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> ret <4 x float> %res @@ -1293,7 +1293,7 @@ define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind { ; CHECK-LABEL: test_mm256_permute_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> ret <8 x float> %res @@ -1934,7 +1934,7 @@ ; X86-LABEL: test_mm256_set1_epi32: ; X86: # %bb.0: ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X86-NEXT: retl ; @@ -2002,13 +2002,13 @@ ; X86-LABEL: test_mm256_set1_ps: ; X86: # %bb.0: ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_set1_ps: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %res0 = insertelement <8 x float> undef, float %a0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -879,13 +879,13 @@ define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) { ; AVX-LABEL: test_x86_avx_vpermil_ps: ; AVX: # 
%bb.0: -; AVX-NEXT: vpermilps $7, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x07] +; AVX-NEXT: vshufps $7, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x07] ; AVX-NEXT: # xmm0 = xmm0[3,1,0,0] ; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vpermil_ps: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps $7, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x07] +; AVX512VL-NEXT: vshufps $7, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x07] ; AVX512VL-NEXT: # xmm0 = xmm0[3,1,0,0] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1] @@ -897,13 +897,13 @@ define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) { ; AVX-LABEL: test_x86_avx_vpermil_ps_256: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x07] +; AVX-NEXT: vshufps $7, %ymm0, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0xc6,0xc0,0x07] ; AVX-NEXT: # ymm0 = ymm0[3,1,0,0,7,5,4,4] ; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX512VL-LABEL: test_x86_avx_vpermil_ps_256: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps $7, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x07] +; AVX512VL-NEXT: vshufps $7, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xc0,0x07] ; AVX512VL-NEXT: # ymm0 = ymm0[3,1,0,0,7,5,4,4] ; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll --- a/llvm/test/CodeGen/X86/avx-splat.ll +++ b/llvm/test/CodeGen/X86/avx-splat.ll @@ -145,7 +145,7 @@ define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp { ; CHECK-LABEL: funcG: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; 
CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} entry: @@ -157,7 +157,7 @@ ; CHECK-LABEL: funcH: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5] ; CHECK-NEXT: ret{{[l|q]}} entry: %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -624,12 +624,12 @@ define <4 x i32> @H(<4 x i32> %a) { ; X86-LABEL: H: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NEXT: retl ; ; X64-LABEL: H: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: retq entry: %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> @@ -982,7 +982,7 @@ ; X64: ## %bb.0: ## %bb ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X64-NEXT: retq bb: %tmp1 = bitcast x86_mmx %tmp to i64 diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll --- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll +++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll @@ -29,7 +29,7 @@ ; CHECK-NEXT: vminpd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vminsd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-NEXT: vmovups %xmm0, (%rax) ; CHECK-NEXT: retq allocas: diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll 
--- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll +++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll @@ -337,7 +337,7 @@ ; ALL-LABEL: shuffle_v8f32_uu67ucuf: ; ALL: # %bb.0: # %entry ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] ; ALL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -2015,7 +2015,7 @@ define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) { ; CHECK-LABEL: test_mm256_shuffle_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -211,7 +211,7 @@ define <4 x i64> @f32to4sl(<4 x float> %a) { ; NODQ-LABEL: f32to4sl: ; NODQ: # %bb.0: -; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; NODQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; NODQ-NEXT: vcvttss2si %xmm1, %rax ; NODQ-NEXT: vmovq %rax, %xmm1 ; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1707,7 +1707,7 @@ ; NOVL: # %bb.0: ; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 ; NOVL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll --- 
a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -987,7 +987,7 @@ define <16 x float> @test_mm512_permute_ps(<16 x float> %a0) { ; CHECK-LABEL: test_mm512_permute_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,0,0,0,6,4,4,4,10,8,8,8,14,12,12,12] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <16 x float> %a0, <16 x float> undef, <16 x i32> ret <16 x float> %res @@ -1128,7 +1128,7 @@ define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) { ; CHECK-LABEL: test_mm512_shuffle_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <8 x i64> %a0 to <16 x i32> %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1022,7 +1022,7 @@ define <16 x float>@test_int_x86_avx512_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermilps $22, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x16] +; CHECK-NEXT: vshufps $22, %zmm0, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xc0,0x16] ; CHECK-NEXT: ## zmm0 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1) @@ -1072,7 +1072,7 @@ define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pshuf_d_512: ; CHECK: ## %bb.0: 
-; CHECK-NEXT: vpermilps $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x03] +; CHECK-NEXT: vshufps $3, %zmm0, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xc0,0x03] ; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll @@ -6,7 +6,7 @@ define <4 x float> @test_4xfloat_perm_mask0(<4 x float> %vec) { ; CHECK-LABEL: test_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,1] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,1] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> ret <4 x float> %res @@ -92,7 +92,7 @@ define <4 x float> @test_4xfloat_perm_mask3(<4 x float> %vec) { ; CHECK-LABEL: test_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,2] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,2] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> ret <4 x float> %res @@ -342,7 +342,7 @@ define <8 x float> @test_8xfloat_perm_imm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_perm_imm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,1,0,6,6,5,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,1,0,6,6,5,4] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res @@ -835,7 +835,7 @@ define <16 x float> @test_16xfloat_perm_imm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_perm_imm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14] +; 
CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,1,0,2,5,5,4,6,9,9,8,10,13,13,12,14] ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> ret <16 x float> %res diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll @@ -2091,7 +2091,7 @@ define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) { ; CHECK-LABEL: test_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,3,0] ; CHECK-NEXT: retq %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res @@ -2171,7 +2171,7 @@ define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) { ; CHECK-LABEL: test_4xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,3] ; CHECK-NEXT: retq %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res @@ -2325,7 +2325,7 @@ define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -2405,7 +2405,7 @@ define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -2559,7 +2559,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps 
{{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res @@ -2639,7 +2639,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -270,7 +270,7 @@ define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { ; ALL-LABEL: trunc_qd_128: ; ALL: ## %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; ALL-NEXT: retq %x = trunc <2 x i64> %i to <2 x i32> ret <2 x i32> %x @@ -279,7 +279,7 @@ define void @trunc_qd_128_mem(<2 x i64> %i, ptr %res) #0 { ; KNL-LABEL: trunc_qd_128_mem: ; KNL: ## %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL-NEXT: vmovlps %xmm0, (%rdi) ; KNL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1211,7 +1211,7 @@ ; AVX512-LABEL: test46: ; AVX512: ## %bb.0: ; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x00] -; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] +; AVX512-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xd4] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3] ; AVX512-NEXT: vandps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x54,0x05,A,A,A,A] ; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1334,7 +1334,7 @@ define half @extract_f16_6(<8 x half> %x) { ; CHECK-LABEL: extract_f16_6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 6 ret half %res @@ -1606,14 +1606,14 @@ define void @extract_store_f16_6(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_6: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_6: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 6 diff --git a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll @@ -31,7 +31,7 @@ ; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpextrq $1, %xmm4, %rax ; CHECK-NEXT: vmovsh %xmm3, (%rax) -; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: vmovsh %xmm3, (%rax) @@ -58,7 +58,7 @@ ; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = 
xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpextrq $1, %xmm3, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; CHECK-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; CHECK-NEXT: vmovq %xmm2, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -891,7 +891,7 @@ define <8 x float>@test_int_x86_avx512_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x16] +; CHECK-NEXT: vshufps $22, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xc0,0x16] ; CHECK-NEXT: # ymm0 = ymm0[2,1,1,0,6,5,5,4] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1) @@ -943,7 +943,7 @@ define <4 x float>@test_int_x86_avx512_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x16] +; CHECK-NEXT: vshufps $22, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x16] ; CHECK-NEXT: # xmm0 = xmm0[2,1,1,0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1) @@ -1967,7 +1967,7 @@ define <4 x i32>@test_int_x86_avx512_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pshuf_d_128: ; CHECK: # %bb.0: -; CHECK-NEXT: 
vpermilps $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x03] +; CHECK-NEXT: vshufps $3, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x03] ; CHECK-NEXT: # xmm0 = xmm0[3,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) @@ -2019,7 +2019,7 @@ define <8 x i32>@test_int_x86_avx512_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pshuf_d_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x03] +; CHECK-NEXT: vshufps $3, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xc0,0x03] ; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0,7,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -352,7 +352,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] @@ -617,7 +617,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: 
vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -627,7 +627,7 @@ ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -451,7 +451,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] @@ -806,7 +806,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -821,7 +821,7 @@ ; AVX1-NEXT: 
vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -246,7 +246,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll --- a/llvm/test/CodeGen/X86/buildvec-extract.ll +++ b/llvm/test/CodeGen/X86/buildvec-extract.ll @@ -83,7 +83,7 @@ ; ; AVX-LABEL: extract1_i32_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq @@ -128,7 +128,7 @@ ; ; AVX-LABEL: extract2_i32_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq @@ -187,7 +187,7 @@ ; ; AVX-LABEL: 
extract0_i32_zext_insert1_i64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -624,7 +624,7 @@ ; AVX1-LABEL: neg_scalar_broadcast_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,3] @@ -790,7 +790,7 @@ ; ; AVX1-LABEL: casted_neg_scalar_broadcast_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -91,7 +91,7 @@ ; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vmovaps (%rsi), %ymm1 ; AVX1-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6],ymm0[7] diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ 
b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -47,7 +47,7 @@ define <4 x float> @vec128_eltty_float_source_subvec_0_target_subvec_mask_1_unary(<4 x float> %x) nounwind { ; CHECK-LABEL: vec128_eltty_float_source_subvec_0_target_subvec_mask_1_unary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; CHECK-NEXT: retq %r = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> ret <4 x float> %r @@ -65,7 +65,7 @@ define <4 x i32> @vec128_eltty_i32_source_subvec_0_target_subvec_mask_1_unary(<4 x i32> %x) nounwind { ; CHECK-LABEL: vec128_eltty_i32_source_subvec_0_target_subvec_mask_1_unary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; CHECK-NEXT: retq %r = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> ret <4 x i32> %r @@ -322,7 +322,7 @@ define <4 x i64> @vec256_eltty_i64_source_subvec_1_target_subvec_mask_2_binary(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: vec256_eltty_i64_source_subvec_1_target_subvec_mask_2_binary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; CHECK-NEXT: retq %r = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> @@ -448,7 +448,7 @@ ; ; CHECK-FAST-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_2_binary: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; CHECK-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; CHECK-FAST-NEXT: retq %r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> @@ -577,7 +577,7 @@ ; ; CHECK-FAST-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_2_binary: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpermilps 
{{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; CHECK-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; CHECK-FAST-NEXT: retq %r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> @@ -598,7 +598,7 @@ ; CHECK-SLOW-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_3_binary: ; CHECK-SLOW: # %bb.0: ; CHECK-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3] -; CHECK-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; CHECK-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; CHECK-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; CHECK-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -68,9 +68,9 @@ ; ; AVX1-LABEL: catcat: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm0[0,0,2,2] diff --git a/llvm/test/CodeGen/X86/extract-store.ll b/llvm/test/CodeGen/X86/extract-store.ll --- a/llvm/test/CodeGen/X86/extract-store.ll +++ b/llvm/test/CodeGen/X86/extract-store.ll @@ -326,7 +326,7 @@ ; AVX-X86-LABEL: extract_i64_1: ; AVX-X86: # %bb.0: ; AVX-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-X86-NEXT: vmovlps %xmm0, (%eax) ; AVX-X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/fdiv-combine-vec.ll b/llvm/test/CodeGen/X86/fdiv-combine-vec.ll --- a/llvm/test/CodeGen/X86/fdiv-combine-vec.ll +++ 
b/llvm/test/CodeGen/X86/fdiv-combine-vec.ll @@ -61,7 +61,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %vy = insertelement <4 x float> undef, float %y, i32 0 @@ -84,7 +84,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -115,7 +115,7 @@ ; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %vy = insertelement <4 x float> undef, float %y, i32 0 @@ -146,7 +146,7 @@ ; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fmaddsub-combine.ll b/llvm/test/CodeGen/X86/fmaddsub-combine.ll --- a/llvm/test/CodeGen/X86/fmaddsub-combine.ll +++ b/llvm/test/CodeGen/X86/fmaddsub-combine.ll @@ -490,8 +490,8 @@ ; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; NOFMA-NEXT: vshufps 
{{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NOFMA-NEXT: retq @@ -578,8 +578,8 @@ ; NOFMA-NEXT: vsubss %xmm9, %xmm8, %xmm8 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] @@ -587,8 +587,8 @@ ; NOFMA-NEXT: vsubss %xmm2, %xmm1, %xmm1 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm5[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -719,12 +719,12 @@ ; NOFMA-NEXT: vsubss %xmm15, %xmm14, %xmm14 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm3, %xmm0, %xmm0 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm7[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = 
xmm7[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[0] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] @@ -732,16 +732,16 @@ ; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm10[0],xmm3[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm5[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm4, %xmm1, %xmm1 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm11[1,1,3,3] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm13[1,1,3,3] ; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm12[0,0] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm11[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] ; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -907,12 +907,12 @@ ; AVX-LABEL: not_a_hsub_2: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; 
AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -66,13 +66,13 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq %lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> @@ -96,17 +96,17 @@ ; AVX1-LABEL: hadd_reverse2_v8f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] ; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse2_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -133,13 +133,13 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse3_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> @@ -337,17 +337,17 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v16f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1] ; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1] ; AVX2-NEXT: vmovaps %ymm3, %ymm0 ; AVX2-NEXT: retq @@ -380,20 +380,20 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vhaddps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0,3,2,5,4,7,6] ; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6] ; AVX1-NEXT: vmovaps %ymm3, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse2_v16f32: ; AVX2: # 
%bb.0: ; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1] ; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovaps %ymm3, %ymm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -208,7 +208,7 @@ ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-SLOW-NEXT: retq @@ -216,7 +216,7 @@ ; AVX-FAST-LABEL: test8_undef: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX-FAST-NEXT: retq %vecext = extractelement <4 x float> %a, i32 0 %vecext1 = extractelement <4 x float> %a, i32 1 @@ -377,7 +377,7 @@ ; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -385,7 +385,7 @@ ; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2 ; AVX512-SLOW-NEXT: 
vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512-SLOW-NEXT: retq @@ -520,15 +520,15 @@ ; ; AVX-SLOW-LABEL: add_ps_030: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: add_ps_030: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> @@ -591,7 +591,7 @@ ; AVX-LABEL: add_ps_016: ; AVX: # %bb.0: ; AVX-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3] ; AVX-NEXT: retq %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> @@ -620,8 +620,8 @@ ; ; AVX-SLOW-LABEL: add_ps_017: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,2,2] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -943,8 +943,8 @@ ; ; AVX-SLOW-LABEL: PR45747_1: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,2,2] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] 
+; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,2,2,2] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -976,7 +976,7 @@ ; AVX-SLOW-LABEL: PR45747_2: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,1,1] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,1,1] ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -502,7 +502,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -563,7 +563,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -683,7 +683,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -729,7 +729,7 @@ ; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <4 x float> %x, i32 2 @@ -837,7 +837,7 @@ ; 
AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -873,7 +873,7 @@ ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -939,7 +939,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -975,7 +975,7 @@ ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1172,7 +1172,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -457,7 +457,7 @@ ; 
X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -470,7 +470,7 @@ ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -531,7 +531,7 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -543,7 +543,7 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -1122,7 +1122,7 @@ ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: 
vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -1137,7 +1137,7 @@ ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1240,7 +1240,7 @@ ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -1254,7 +1254,7 @@ ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -460,7 +460,7 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -473,7 +473,7 @@ ; 
X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -535,7 +535,7 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -547,7 +547,7 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -1126,7 +1126,7 @@ ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -1141,7 +1141,7 @@ ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: 
vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1244,7 +1244,7 @@ ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -1258,7 +1258,7 @@ ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -52,7 +52,7 @@ ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 @@ -531,7 +531,7 @@ ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -549,7 +549,7 @@ ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, 
%xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -621,7 +621,7 @@ ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -638,7 +638,7 @@ ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1236,7 +1236,7 @@ ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1258,7 +1258,7 @@ ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1381,7 +1381,7 @@ ; 
X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1402,7 +1402,7 @@ ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -53,7 +53,7 @@ ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 @@ -475,7 +475,7 @@ ; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -493,7 +493,7 @@ ; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -567,7 +567,7 @@ ; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; X64-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -584,7 +584,7 @@ ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1152,7 +1152,7 @@ ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1174,7 +1174,7 @@ ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1299,7 +1299,7 @@ ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, 
%xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1320,7 +1320,7 @@ ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-2.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-2.ll --- a/llvm/test/CodeGen/X86/horizontal-shuffle-2.ll +++ b/llvm/test/CodeGen/X86/horizontal-shuffle-2.ll @@ -14,7 +14,7 @@ ; AVX-LABEL: test_unpacklo_hadd_v4f32: ; AVX: ## %bb.0: ; AVX-NEXT: vhaddps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: ret{{[l|q]}} %5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4 %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4 @@ -33,7 +33,7 @@ ; AVX-LABEL: test_unpackhi_hadd_v4f32: ; AVX: ## %bb.0: ; AVX-NEXT: vhaddps %xmm3, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: ret{{[l|q]}} %5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4 %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4 @@ -51,7 +51,7 @@ ; AVX-LABEL: test_unpacklo_hsub_v4f32: ; AVX: ## %bb.0: ; AVX-NEXT: vhsubps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: ret{{[l|q]}} %5 = tail call <4 x float> 
@llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4 %6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4 @@ -70,7 +70,7 @@ ; AVX-LABEL: test_unpackhi_hsub_v4f32: ; AVX: ## %bb.0: ; AVX-NEXT: vhsubps %xmm3, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: ret{{[l|q]}} %5 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4 %6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4 @@ -164,7 +164,7 @@ ; AVX-LABEL: test_unpacklo_hadd_v4f32_unary: ; AVX: ## %bb.0: ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: ret{{[l|q]}} %2 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) #4 %3 = shufflevector <4 x float> %2, <4 x float> %2, <4 x i32> diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll --- a/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll +++ b/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: test_unpacklo_hadd_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vhaddps %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; CHECK-NEXT: ret{{[l|q]}} %5 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %1) #4 %6 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %2, <8 x float> %3) #4 @@ -18,7 +18,7 @@ ; CHECK-LABEL: test_unpackhi_hadd_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vhaddps %ymm3, %ymm1, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; CHECK-NEXT: ret{{[l|q]}} %5 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %1) #4 %6 = tail call <8 x float> 
@llvm.x86.avx.hadd.ps.256(<8 x float> %2, <8 x float> %3) #4 @@ -30,7 +30,7 @@ ; CHECK-LABEL: test_unpacklo_hsub_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vhsubps %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; CHECK-NEXT: ret{{[l|q]}} %5 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %0, <8 x float> %1) #4 %6 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %2, <8 x float> %3) #4 @@ -42,7 +42,7 @@ ; CHECK-LABEL: test_unpackhi_hsub_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vhsubps %ymm3, %ymm1, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; CHECK-NEXT: ret{{[l|q]}} %5 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %0, <8 x float> %1) #4 %6 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %2, <8 x float> %3) #4 diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll --- a/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll +++ b/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll @@ -61,7 +61,7 @@ ; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vhaddps %ymm3, %ymm2, %ymm1 ; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3) diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -218,8 +218,8 @@ ; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] 
-; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 ; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 @@ -262,8 +262,8 @@ ; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 @@ -893,27 +893,27 @@ ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-SLOW-NEXT: 
vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-SLOW-NEXT: retq @@ -923,24 +923,24 @@ ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0] ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll --- 
a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -299,7 +299,7 @@ ; X86-AVX-NEXT: # xmm1 = mem[0,0] ; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; @@ -359,7 +359,7 @@ ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] ; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -187,7 +187,7 @@ ; ; AVX1-LABEL: arg_f32_v4f32_undef: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: arg_f32_v4f32_undef: @@ -578,7 +578,7 @@ ; ; AVX1-LABEL: arg_f32_v8f32_undef: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1053,7 +1053,7 @@ ; ; AVX1-LABEL: arg_f32_v4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovd %edi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 @@ -1777,7 +1777,7 @@ ; ; AVX1-LABEL: arg_f32_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, 
%ymm1, %ymm1 ; AVX1-NEXT: vmovd %edi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -136,14 +136,14 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind { ; X86-LABEL: knownbits_mask_shuffle_uitofp: ; X86: # %bb.0: -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: knownbits_mask_shuffle_uitofp: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq @@ -173,7 +173,7 @@ define <4 x float> @knownbits_mask_xor_shuffle_uitofp(<4 x i32> %a0) nounwind { ; X86-LABEL: knownbits_mask_xor_shuffle_uitofp: ; X86: # %bb.0: -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 @@ -181,7 +181,7 @@ ; ; X64-LABEL: knownbits_mask_xor_shuffle_uitofp: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 @@ -384,10 +384,10 @@ define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; X86-LABEL: knownbits_mask_concat_uitofp: ; X86: # %bb.0: -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,1,3] +; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] ; 
X86-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071] ; X86-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; X86-NEXT: vandps %xmm2, %xmm0, %xmm0 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 @@ -395,10 +395,10 @@ ; ; X64-LABEL: knownbits_mask_concat_uitofp: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,1,3] +; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] ; X64-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071] ; X64-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; X64-NEXT: vandps %xmm2, %xmm0, %xmm0 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 @@ -601,7 +601,7 @@ ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3 ; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp @@ -613,7 +613,7 @@ ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = and <4 x i32> %a2, @@ -637,7 +637,7 @@ ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3 ; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp @@ -649,7 +649,7 @@ ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 
; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = lshr <4 x i32> %a2, diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -148,7 +148,7 @@ ; X86-LABEL: signbits_ashr_extract_sitofp_0: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -157,7 +157,7 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_0: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, @@ -170,7 +170,7 @@ ; X86-LABEL: signbits_ashr_extract_sitofp_1: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -179,7 +179,7 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_1: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, @@ -322,7 +322,7 @@ define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: signbits_ashr_concat_ashr_extract_sitofp: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = ashr <2 x 
i64> %a0, @@ -415,7 +415,7 @@ ; X86-NEXT: vpsrad $1, %xmm2, %xmm2 ; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] -; X86-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,3,3] +; X86-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3] ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 ; X86-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -443,7 +443,7 @@ ; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,3,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3] ; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -465,7 +465,7 @@ ; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; X64-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; X64-AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4958,7 +4958,7 @@ ; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax) ; X86-AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; X86-AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X86-AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; X86-AVX512-NEXT: vmovlps %xmm0, 88(%eax) ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll 
b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1649,7 +1649,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -2440,7 +2440,7 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -2457,7 +2457,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -2597,7 +2597,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] ; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 ; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- 
a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -2142,7 +2142,7 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -2158,7 +2158,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -2279,7 +2279,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] ; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -43,9 +43,9 @@ ; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX1-NEXT: vmulps %xmm4, %xmm2, %xmm4 ; AVX1-NEXT: vaddps %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2] ; AVX1-NEXT: vmulps %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vmulps %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] @@ -59,9 
+59,9 @@ ; AVX2-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX2-NEXT: vmulps %xmm4, %xmm2, %xmm4 ; AVX2-NEXT: vaddps %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2] +; AVX2-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2] ; AVX2-NEXT: vmulps %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-NEXT: vmulps %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] @@ -75,9 +75,9 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX512-NEXT: vmulps %xmm4, %xmm2, %xmm4 ; AVX512-NEXT: vaddps %xmm4, %xmm3, %xmm3 -; AVX512-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,2,2,2] +; AVX512-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,2,2,2] ; AVX512-NEXT: vmulps %xmm4, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulps %xmm1, %xmm2, %xmm1 ; AVX512-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vmovlhps {{.*#+}} xmm0 = xmm3[0],xmm0[0] @@ -287,7 +287,7 @@ ; AVX1-NEXT: vmulss %xmm3, %xmm8, %xmm3 ; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm7[0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[1,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,2,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -365,9 +365,9 @@ ; AVX512F-NEXT: vmulps %xmm6, %xmm2, %xmm4 ; AVX512F-NEXT: vaddps %xmm4, %xmm3, %xmm4 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm3 = xmm5[1,0] -; AVX512F-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3] ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] -; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm1[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2] ; AVX512F-NEXT: vmulps %xmm3, %xmm9, %xmm9 ; AVX512F-NEXT: vaddps 
%xmm4, %xmm9, %xmm9 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm4 = xmm0[1,0] @@ -393,9 +393,9 @@ ; AVX512F-NEXT: vmulss %xmm11, %xmm10, %xmm11 ; AVX512F-NEXT: vaddss %xmm7, %xmm11, %xmm7 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm7 = xmm8[0,1],xmm7[0],xmm8[3] -; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm9[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm9[3,3,3,3] ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] -; AVX512F-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,2,2,2] ; AVX512F-NEXT: vmulps %xmm0, %xmm9, %xmm0 ; AVX512F-NEXT: vmulps %xmm2, %xmm8, %xmm2 ; AVX512F-NEXT: vaddps %xmm2, %xmm0, %xmm0 @@ -424,9 +424,9 @@ ; AVX512VL-NEXT: vmulps %xmm5, %xmm2, %xmm6 ; AVX512VL-NEXT: vaddps %xmm6, %xmm3, %xmm3 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm4[1,0] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm1[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm1[1,0] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm9 = xmm1[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm1[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm6, %xmm9, %xmm9 ; AVX512VL-NEXT: vaddps %xmm3, %xmm9, %xmm3 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm9 = xmm0[1,0] @@ -452,9 +452,9 @@ ; AVX512VL-NEXT: vmulss %xmm11, %xmm10, %xmm11 ; AVX512VL-NEXT: vaddss %xmm7, %xmm11, %xmm7 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm7 = xmm8[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm8[1,0] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,2,2,2] ; AVX512VL-NEXT: vmulps %xmm0, %xmm8, %xmm0 ; AVX512VL-NEXT: vmulps %xmm7, %xmm2, %xmm2 ; AVX512VL-NEXT: vaddps %xmm2, %xmm0, %xmm0 @@ -1008,60 +1008,60 @@ ; ; AVX1-LABEL: test_mul4x4_f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] +; AVX1-NEXT: 
vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3] ; AVX1-NEXT: vmulps %ymm4, %ymm5, %ymm4 -; AVX1-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 ; AVX1-NEXT: vmulps %ymm6, %ymm7, %ymm0 ; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm6 ; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4 ; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] ; AVX1-NEXT: vmulps %ymm2, %ymm5, %ymm2 -; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4] ; AVX1-NEXT: vmulps %ymm4, %ymm7, %ymm4 ; AVX1-NEXT: vaddps %ymm2, %ymm4, %ymm2 -; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] ; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4 ; AVX1-NEXT: vaddps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] ; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; ; AVX2-LABEL: test_mul4x4_f32: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,3,2,3] ; AVX2-NEXT: vmulps %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpermilps {{.*#+}} ymm6 = 
ymm2[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,0,1] ; AVX2-NEXT: vmulps %ymm6, %ymm7, %ymm0 ; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,1,0,1] ; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4 ; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vmulps %ymm2, %ymm1, %ymm2 ; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-NEXT: vmulps %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-NEXT: vmulps %ymm4, %ymm7, %ymm4 ; AVX2-NEXT: vaddps %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpermilps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] ; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4 ; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] ; AVX2-NEXT: vmulps %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: retq @@ -1074,20 +1074,20 @@ ; AVX512-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX512-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm2 -; AVX512-NEXT: vpermilps {{.*#+}} zmm4 = zmm2[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] +; AVX512-NEXT: vshufps {{.*#+}} zmm4 = zmm2[1,1,1,1,5,5,5,5,9,9,9,9,13,13,13,13] ; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm5 = zmm0[2,3,2,3,2,3,2,3] ; AVX512-NEXT: vmulps %zmm4, %zmm5, %zmm4 -; AVX512-NEXT: vpermilps {{.*#+}} ymm1 = 
ymm1[0,0,0,0,4,4,4,4] -; AVX512-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; AVX512-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,0,0,4,4,4,4] ; AVX512-NEXT: vinsertf64x4 $1, %ymm3, %zmm1, %zmm1 ; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm0[0,1,0,1,0,1,0,1] ; AVX512-NEXT: vmulps %zmm1, %zmm3, %zmm1 ; AVX512-NEXT: vaddps %zmm4, %zmm1, %zmm1 -; AVX512-NEXT: vpermilps {{.*#+}} zmm3 = zmm2[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] +; AVX512-NEXT: vshufps {{.*#+}} zmm3 = zmm2[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] ; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm4 = zmm0[4,5,4,5,4,5,4,5] ; AVX512-NEXT: vmulps %zmm3, %zmm4, %zmm3 ; AVX512-NEXT: vaddps %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpermilps {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512-NEXT: vshufps {{.*#+}} zmm2 = zmm2[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] ; AVX512-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,6,7,6,7,6,7] ; AVX512-NEXT: vmulps %zmm2, %zmm0, %zmm0 ; AVX512-NEXT: vaddps %zmm0, %zmm1, %zmm0 @@ -2400,10 +2400,10 @@ ; ; AVX512F-LABEL: test_mul8x8_f32: ; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2] ; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm9 ; AVX512F-NEXT: vextractf64x4 $1, %zmm4, %ymm8 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm10 = xmm8[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm10 = xmm8[2,2,2,2] ; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm9, %zmm16 ; AVX512F-NEXT: vbroadcastss %xmm4, %ymm10 @@ -2415,20 +2415,20 @@ ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm12 = xmm8[1,1,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm11, %zmm18 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm4[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm4[3,3,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm12, %ymm12 ; AVX512F-NEXT: vextractf128 $1, %ymm4, %xmm13 ; 
AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,3,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm8, %zmm12, %zmm19 ; AVX512F-NEXT: vextractf32x4 $3, %zmm4, %xmm12 ; AVX512F-NEXT: vbroadcastss %xmm12, %ymm12 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm14 = xmm5[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm14 = xmm5[2,2,2,2] ; AVX512F-NEXT: vbroadcastsd %xmm14, %ymm14 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm13, %zmm20 ; AVX512F-NEXT: vextractf64x4 $1, %zmm5, %ymm13 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm15 = xmm13[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm13[2,2,2,2] ; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm15, %zmm14, %zmm21 ; AVX512F-NEXT: vbroadcastss %xmm5, %ymm15 @@ -2440,20 +2440,20 @@ ; AVX512F-NEXT: vmovshdup {{.*#+}} xmm10 = xmm13[1,1,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm10, %ymm10 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm10, %zmm15, %zmm10 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm15 = xmm5[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm15 = xmm5[3,3,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm15, %ymm15 ; AVX512F-NEXT: vextractf128 $1, %ymm5, %xmm11 ; AVX512F-NEXT: vbroadcastss %xmm11, %ymm11 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm13 = xmm13[3,3,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm13, %ymm13 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm13, %zmm15, %zmm15 ; AVX512F-NEXT: vextractf32x4 $3, %zmm5, %xmm13 ; AVX512F-NEXT: vbroadcastss %xmm13, %ymm13 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm6[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm6[2,2,2,2] ; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm13, %zmm11, %zmm11 ; AVX512F-NEXT: vextractf64x4 $1, %zmm6, %ymm13 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm12 = xmm13[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm12 = xmm13[2,2,2,2] ; AVX512F-NEXT: 
vbroadcastsd %xmm12, %ymm12 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm12, %zmm8, %zmm8 ; AVX512F-NEXT: vbroadcastss %xmm6, %ymm12 @@ -2472,8 +2472,8 @@ ; AVX512F-NEXT: vinsertf64x4 $1, %ymm2, %zmm2, %zmm26 ; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm27 = zmm2[4,5,6,7,4,5,6,7] ; AVX512F-NEXT: vmovshdup {{.*#+}} zmm0 = zmm4[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; AVX512F-NEXT: vpermilps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] -; AVX512F-NEXT: vpermilps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512F-NEXT: vshufps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] +; AVX512F-NEXT: vshufps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] ; AVX512F-NEXT: vinsertf64x4 $1, %ymm3, %zmm3, %zmm4 ; AVX512F-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] ; AVX512F-NEXT: vmulps %zmm17, %zmm14, %zmm17 @@ -2507,11 +2507,11 @@ ; AVX512F-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6] ; AVX512F-NEXT: vmulps %zmm2, %zmm27, %zmm2 ; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpermilps {{.*#+}} zmm2 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] +; AVX512F-NEXT: vshufps {{.*#+}} zmm2 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] ; AVX512F-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6] ; AVX512F-NEXT: vmulps %zmm2, %zmm4, %zmm2 ; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpermilps {{.*#+}} zmm2 = zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512F-NEXT: vshufps {{.*#+}} zmm2 = zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] ; AVX512F-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6] ; AVX512F-NEXT: vmulps %zmm2, %zmm3, %zmm2 ; AVX512F-NEXT: vaddps %zmm2, %zmm1, %zmm1 @@ -2520,9 +2520,9 @@ ; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 ; AVX512F-NEXT: vmulps %zmm8, %zmm24, %zmm5 ; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm5, %ymm5 -; 
AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm13[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm13[3,3,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm8, %zmm5, %zmm5 ; AVX512F-NEXT: vextractf128 $1, %ymm6, %xmm8 @@ -2532,7 +2532,7 @@ ; AVX512F-NEXT: vextractf32x4 $3, %zmm6, %xmm5 ; AVX512F-NEXT: vbroadcastss %xmm5, %ymm5 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm8, %zmm5 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm7[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm8 = xmm7[2,2,2,2] ; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8 ; AVX512F-NEXT: vmulps %zmm5, %zmm26, %zmm5 ; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 @@ -2540,16 +2540,16 @@ ; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6] ; AVX512F-NEXT: vmulps %zmm5, %zmm27, %zmm5 ; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 -; AVX512F-NEXT: vpermilps {{.*#+}} zmm5 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] +; AVX512F-NEXT: vshufps {{.*#+}} zmm5 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] ; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6] ; AVX512F-NEXT: vmulps %zmm5, %zmm4, %zmm5 ; AVX512F-NEXT: vaddps %zmm5, %zmm2, %zmm2 ; AVX512F-NEXT: vextractf64x4 $1, %zmm7, %ymm5 -; AVX512F-NEXT: vpermilps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512F-NEXT: vshufps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] ; AVX512F-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6] ; AVX512F-NEXT: vmulps %zmm6, %zmm3, %zmm6 ; AVX512F-NEXT: vaddps %zmm6, %zmm2, %zmm2 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2] +; AVX512F-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2] ; AVX512F-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm6, %zmm8, %zmm6 ; AVX512F-NEXT: vbroadcastss %xmm7, %ymm8 @@ -2566,9 +2566,9 @@ ; AVX512F-NEXT: vmulps %zmm9, %zmm23, %zmm9 ; AVX512F-NEXT: vaddps %zmm9, %zmm8, %zmm8 ; AVX512F-NEXT: vaddps %zmm6, %zmm8, %zmm6 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm8 = xmm7[3,3,3,3] +; AVX512F-NEXT: 
vshufps {{.*#+}} xmm8 = xmm7[3,3,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm8, %ymm8 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,3,3,3] ; AVX512F-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX512F-NEXT: vinsertf64x4 $1, %ymm5, %zmm8, %zmm5 ; AVX512F-NEXT: vmulps %zmm5, %zmm25, %zmm5 @@ -2584,11 +2584,11 @@ ; AVX512F-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6] ; AVX512F-NEXT: vmulps %zmm6, %zmm27, %zmm6 ; AVX512F-NEXT: vaddps %zmm6, %zmm5, %zmm5 -; AVX512F-NEXT: vpermilps {{.*#+}} zmm6 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] +; AVX512F-NEXT: vshufps {{.*#+}} zmm6 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] ; AVX512F-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6] ; AVX512F-NEXT: vmulps %zmm6, %zmm4, %zmm4 ; AVX512F-NEXT: vaddps %zmm4, %zmm5, %zmm4 -; AVX512F-NEXT: vpermilps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512F-NEXT: vshufps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] ; AVX512F-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6] ; AVX512F-NEXT: vmulps %zmm5, %zmm3, %zmm3 ; AVX512F-NEXT: vaddps %zmm3, %zmm4, %zmm3 @@ -2599,9 +2599,9 @@ ; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm11 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm4[1,1,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm10 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2] ; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm14 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm8 = xmm4[3,3,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm8, %ymm9 ; AVX512VL-NEXT: vextractf128 $1, %ymm4, %xmm8 ; AVX512VL-NEXT: vbroadcastss %xmm8, %ymm8 @@ -2610,9 +2610,9 @@ ; AVX512VL-NEXT: vextractf64x4 $1, %zmm4, %ymm12 ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm12[1,1,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm13 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm16 = xmm12[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 
= xmm12[2,2,2,2] ; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,3,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm12, %ymm12 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm16, %zmm14, %zmm16 ; AVX512VL-NEXT: vextractf32x4 $3, %zmm4, %xmm14 @@ -2631,7 +2631,7 @@ ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm13 = xmm5[1,1,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm13, %ymm16 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm12, %zmm9, %zmm12 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] ; AVX512VL-NEXT: vbroadcastsd %xmm9, %ymm18 ; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm9 = zmm1[4,5,6,7,4,5,6,7] ; AVX512VL-NEXT: vmulps %zmm12, %zmm9, %zmm1 @@ -2645,18 +2645,18 @@ ; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm12 = zmm2[4,5,6,7,4,5,6,7] ; AVX512VL-NEXT: vmulps %zmm1, %zmm12, %zmm1 ; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] +; AVX512VL-NEXT: vshufps {{.*#+}} zmm1 = zmm4[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm1 = zmm1[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm3, %zmm3, %zmm13 ; AVX512VL-NEXT: vmulps %zmm1, %zmm13, %zmm1 ; AVX512VL-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; AVX512VL-NEXT: vextractf64x4 $1, %zmm5, %ymm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512VL-NEXT: vshufps {{.*#+}} zmm2 = zmm4[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm2 = zmm2[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vshuff64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] ; AVX512VL-NEXT: vmulps %zmm2, %zmm3, %zmm2 ; AVX512VL-NEXT: vaddps %zmm2, %zmm0, %zmm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,2,2,2] ; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm2 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm2, %zmm18, 
%zmm2 ; AVX512VL-NEXT: vextractf32x4 $2, %zmm5, %xmm4 @@ -2665,14 +2665,14 @@ ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm1[1,1,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm15, %zmm16, %zmm15 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm16 = xmm5[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm16 = xmm5[3,3,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm16, %ymm16 ; AVX512VL-NEXT: vmulps %zmm4, %zmm11, %zmm4 ; AVX512VL-NEXT: vmulps %zmm15, %zmm10, %zmm15 ; AVX512VL-NEXT: vaddps %zmm15, %zmm4, %zmm4 ; AVX512VL-NEXT: vextractf128 $1, %ymm5, %xmm15 ; AVX512VL-NEXT: vbroadcastss %xmm15, %ymm15 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX512VL-NEXT: vmulps %zmm2, %zmm14, %zmm2 ; AVX512VL-NEXT: vaddps %zmm2, %zmm4, %zmm2 @@ -2685,7 +2685,7 @@ ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm2 = xmm6[1,1,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm2, %ymm2 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm15, %zmm4 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm15 = xmm6[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm15 = xmm6[2,2,2,2] ; AVX512VL-NEXT: vbroadcastsd %xmm15, %ymm15 ; AVX512VL-NEXT: vmulps %zmm4, %zmm8, %zmm4 ; AVX512VL-NEXT: vaddps %zmm4, %zmm1, %zmm1 @@ -2693,16 +2693,16 @@ ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vmulps %zmm4, %zmm12, %zmm4 ; AVX512VL-NEXT: vaddps %zmm4, %zmm1, %zmm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} zmm4 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] +; AVX512VL-NEXT: vshufps {{.*#+}} zmm4 = zmm5[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vmulps %zmm4, %zmm13, %zmm4 ; AVX512VL-NEXT: vaddps %zmm4, %zmm1, %zmm1 ; AVX512VL-NEXT: vextractf64x4 $1, %zmm6, %ymm4 -; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512VL-NEXT: vshufps {{.*#+}} zmm5 = 
zmm5[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vmulps %zmm5, %zmm3, %zmm5 ; AVX512VL-NEXT: vaddps %zmm5, %zmm1, %zmm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,2,2,2] ; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm5, %zmm15, %zmm5 ; AVX512VL-NEXT: vextractf32x4 $2, %zmm6, %xmm15 @@ -2716,9 +2716,9 @@ ; AVX512VL-NEXT: vaddps %zmm2, %zmm15, %zmm2 ; AVX512VL-NEXT: vmulps %zmm5, %zmm14, %zmm5 ; AVX512VL-NEXT: vaddps %zmm5, %zmm2, %zmm2 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm5, %zmm4 ; AVX512VL-NEXT: vextractf128 $1, %ymm6, %xmm5 @@ -2728,7 +2728,7 @@ ; AVX512VL-NEXT: vextractf32x4 $3, %zmm6, %xmm4 ; AVX512VL-NEXT: vbroadcastss %xmm4, %ymm4 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm5, %zmm4 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm7[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2] ; AVX512VL-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX512VL-NEXT: vmulps %zmm4, %zmm8, %zmm4 ; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2 @@ -2736,16 +2736,16 @@ ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vmulps %zmm4, %zmm12, %zmm4 ; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2 -; AVX512VL-NEXT: vpermilps {{.*#+}} zmm4 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] +; AVX512VL-NEXT: vshufps {{.*#+}} zmm4 = zmm6[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm4 = zmm4[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vmulps %zmm4, %zmm13, %zmm4 ; AVX512VL-NEXT: vaddps %zmm4, %zmm2, %zmm2 ; AVX512VL-NEXT: vextractf64x4 $1, %zmm7, %ymm4 -; 
AVX512VL-NEXT: vpermilps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512VL-NEXT: vshufps {{.*#+}} zmm6 = zmm6[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm6 = zmm6[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vmulps %zmm6, %zmm3, %zmm6 ; AVX512VL-NEXT: vaddps %zmm6, %zmm2, %zmm2 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm6 = xmm4[2,2,2,2] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm4[2,2,2,2] ; AVX512VL-NEXT: vbroadcastsd %xmm6, %ymm6 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm6, %zmm5, %zmm5 ; AVX512VL-NEXT: vbroadcastss %xmm7, %ymm6 @@ -2762,9 +2762,9 @@ ; AVX512VL-NEXT: vmulps %zmm11, %zmm10, %zmm10 ; AVX512VL-NEXT: vaddps %zmm10, %zmm6, %zmm6 ; AVX512VL-NEXT: vaddps %zmm5, %zmm6, %zmm5 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm6 = xmm7[3,3,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm6, %ymm6 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,3,3,3] ; AVX512VL-NEXT: vbroadcastsd %xmm4, %ymm4 ; AVX512VL-NEXT: vinsertf64x4 $1, %ymm4, %zmm6, %zmm4 ; AVX512VL-NEXT: vmulps %zmm4, %zmm9, %zmm4 @@ -2780,11 +2780,11 @@ ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vmulps %zmm5, %zmm12, %zmm5 ; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4 -; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] +; AVX512VL-NEXT: vshufps {{.*#+}} zmm5 = zmm7[2,2,2,2,6,6,6,6,10,10,10,10,14,14,14,14] ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vmulps %zmm5, %zmm13, %zmm5 ; AVX512VL-NEXT: vaddps %zmm5, %zmm4, %zmm4 -; AVX512VL-NEXT: vpermilps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] +; AVX512VL-NEXT: vshufps {{.*#+}} zmm5 = zmm7[3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15] ; AVX512VL-NEXT: vpermpd {{.*#+}} zmm5 = zmm5[2,2,2,2,6,6,6,6] ; AVX512VL-NEXT: vmulps %zmm5, %zmm3, %zmm3 ; AVX512VL-NEXT: vaddps %zmm3, %zmm4, %zmm3 diff --git 
a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -129,7 +129,7 @@ ; ; AVX-LABEL: v5i32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,2,3] ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi) ; AVX-NEXT: vmovaps %xmm1, (%rdi) @@ -161,7 +161,7 @@ ; AVX-LABEL: v5f32: ; AVX: # %bb.0: ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi) ; AVX-NEXT: vmovaps %xmm1, (%rdi) ; AVX-NEXT: retq @@ -313,9 +313,9 @@ ; AVX-LABEL: v7i32: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2,3,2] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovss %xmm1, 24(%rdi) ; AVX-NEXT: vmovlps %xmm0, 16(%rdi) ; AVX-NEXT: vmovaps %xmm2, (%rdi) @@ -513,12 +513,12 @@ ; AVX1-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6] ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = 
xmm1[0],xmm0[1,2],xmm1[3] ; AVX1-NEXT: vmovaps %xmm0, 32(%rdi) ; AVX1-NEXT: vmovaps %ymm2, (%rdi) @@ -533,7 +533,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rdi) ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdi) @@ -548,7 +548,7 @@ ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-FAST-ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, 32(%rdi) ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdi) @@ -563,7 +563,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 32(%rdi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdi) @@ -574,12 +574,12 @@ ; XOP: # %bb.0: ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] ; XOP-NEXT: vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6] -; XOP-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1] ; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; XOP-NEXT: vmovaps %xmm0, 32(%rdi) ; XOP-NEXT: vmovaps %ymm2, (%rdi) @@ -1479,14 +1479,14 @@ ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] ; AVX1-NEXT: vmovups 16(%rdi), %xmm6 ; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] @@ -1520,7 +1520,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: 
vmovups %ymm3, (%rsi) @@ -1580,7 +1580,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsi) @@ -1600,14 +1600,14 @@ ; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] ; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; XOP-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] ; XOP-NEXT: vmovups 16(%rdi), %xmm6 ; XOP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; XOP-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] @@ -1746,7 +1746,7 @@ ; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2 ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] ; 
AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] @@ -1757,7 +1757,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -1786,7 +1786,7 @@ ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX2-FAST-ALL-NEXT: vbroadcastsd (%rcx), %ymm5 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-ALL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -1803,7 +1803,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] @@ -1814,7 +1814,7 @@ ; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -2409,7 +2409,7 @@ ; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -2420,7 +2420,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2] ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] @@ -2481,7 +2481,7 @@ ; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; XOP-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = 
xmm1[0,1,2,3],xmm0[4,5,6,7] ; XOP-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -2492,7 +2492,7 @@ ; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; XOP-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] +; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] ; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2] ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -202,6 +202,7 @@ ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 Atom pad short functions ; CHECK-NEXT: X86 LEA Fixup +; CHECK-NEXT: X86 Fixup Inst Tuning ; CHECK-NEXT: Compressing EVEX instrs to VEX encoding when possible ; CHECK-NEXT: X86 Discriminate Memory Operands ; CHECK-NEXT: X86 Insert Cache Prefetches diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -185,7 +185,7 @@ ; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vzeroupper @@ -236,7 +236,7 @@ ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; 
X64-AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/palignr.ll b/llvm/test/CodeGen/X86/palignr.ll --- a/llvm/test/CodeGen/X86/palignr.ll +++ b/llvm/test/CodeGen/X86/palignr.ll @@ -11,7 +11,7 @@ ; ; CHECK-AVX-LABEL: test1: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,0] +; CHECK-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,0] ; CHECK-AVX-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 > ret <4 x i32> %C diff --git a/llvm/test/CodeGen/X86/pr31956.ll b/llvm/test/CodeGen/X86/pr31956.ll --- a/llvm/test/CodeGen/X86/pr31956.ll +++ b/llvm/test/CodeGen/X86/pr31956.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: vmovaps G2(%rip), %xmm0 ; CHECK-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1] ; CHECK-NEXT: retq entry: %V = load <2 x float>, ptr @G1, align 8 diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll --- a/llvm/test/CodeGen/X86/pr40730.ll +++ b/llvm/test/CodeGen/X86/pr40730.ll @@ -6,7 +6,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] @@ -27,7 +27,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7] ; CHECK-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/pr40811.ll b/llvm/test/CodeGen/X86/pr40811.ll --- a/llvm/test/CodeGen/X86/pr40811.ll +++ b/llvm/test/CodeGen/X86/pr40811.ll @@ -6,8 +6,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2,3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,1,0] -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr50609.ll b/llvm/test/CodeGen/X86/pr50609.ll --- a/llvm/test/CodeGen/X86/pr50609.ll +++ b/llvm/test/CodeGen/X86/pr50609.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; CHECK-NEXT: vpsrad $2, %xmm2, %xmm2 ; CHECK-NEXT: vcvtdq2ps %ymm2, %ymm2 -; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0] ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; CHECK-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi) ; CHECK-NEXT: vmaskmovps %ymm2, %ymm1, 32(%rdi) diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -80,7 +80,7 @@ define <4 x i32> @rot_v4i32_zero_non_splat(<4 x i32> %x) { ; XOPAVX1-LABEL: rot_v4i32_zero_non_splat: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: rot_v4i32_zero_non_splat: diff --git a/llvm/test/CodeGen/X86/scalarize-fp.ll b/llvm/test/CodeGen/X86/scalarize-fp.ll --- a/llvm/test/CodeGen/X86/scalarize-fp.ll +++ b/llvm/test/CodeGen/X86/scalarize-fp.ll @@ -394,7 +394,7 @@ ; AVX-LABEL: fmul_splat_splat_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[0,0,0,0] ; AVX-NEXT: retq %splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer %splaty = shufflevector <4 x float> %vy, <4 x float> undef, <4 x i32> zeroinitializer @@ -413,7 +413,7 @@ ; AVX-LABEL: fdiv_splat_splat_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> %vx, <8 x float> undef, <8 x i32> zeroinitializer @@ -555,7 +555,7 @@ ; AVX-LABEL: fmul_splat_const_op1_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer %r = fmul fast <4 x float> %splatx, @@ -575,7 +575,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> , <8 x float> undef, <8 x i32> zeroinitializer @@ -597,7 +597,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> %vx, <8 x float> undef, <8 x i32> zeroinitializer @@ -654,7 +654,7 @@ ; AVX-LABEL: splat0_fmul_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %b = fmul fast <4 x float> %vx, %vy %r = shufflevector <4 x 
float> %b, <4 x float> undef, <4 x i32> zeroinitializer @@ -672,7 +672,7 @@ ; AVX-LABEL: splat0_fdiv_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %b = fdiv fast <8 x float> %vx, %vy @@ -729,7 +729,7 @@ ; AVX-LABEL: splat0_fmul_const_op1_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %b = fmul fast <4 x float> %vx, %r = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer @@ -745,7 +745,7 @@ ; ; AVX-LABEL: splat0_fdiv_const_op1_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %b = fdiv fast <8 x float> %vx, @@ -766,7 +766,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %b = fdiv fast <8 x float> , %vx @@ -786,7 +786,7 @@ ; AVX-LABEL: multi_use_binop: ; AVX: # %bb.0: ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-of-shift.ll b/llvm/test/CodeGen/X86/shuffle-of-shift.ll --- a/llvm/test/CodeGen/X86/shuffle-of-shift.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-shift.ll @@ -156,7 +156,7 @@ ; X64-AVX2-NEXT: pushq %rax ; X64-AVX2-NEXT: movl $63, %edi ; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; 
X64-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; X64-AVX2-NEXT: popq %rax ; X64-AVX2-NEXT: retq ; @@ -173,7 +173,7 @@ ; X86-AVX2-NEXT: pushl $63 ; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT ; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; X86-AVX2-NEXT: retl %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63) %i2 = bitcast <2 x i64> %i1 to <4 x i32> @@ -336,7 +336,7 @@ ; X64-AVX2-NEXT: pushq %rax ; X64-AVX2-NEXT: movl $63, %edi ; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; X64-AVX2-NEXT: popq %rax ; X64-AVX2-NEXT: retq ; @@ -353,7 +353,7 @@ ; X86-AVX2-NEXT: pushl $63 ; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT ; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; X86-AVX2-NEXT: retl %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63) %i2 = bitcast <2 x i64> %i1 to <2 x i64> diff --git a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll --- a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -58,7 +58,7 @@ define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -68,7 +68,7 @@ define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] 
+; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -78,7 +78,7 @@ define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask3: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -88,8 +88,8 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, ptr %p) nounwind { ; AVX2-LABEL: undef_splatmask4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vmovaps %xmm0, (%rdi) ; AVX2-NEXT: vmovaps %xmm1, %xmm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sse-fsignum.ll b/llvm/test/CodeGen/X86/sse-fsignum.ll --- a/llvm/test/CodeGen/X86/sse-fsignum.ll +++ b/llvm/test/CodeGen/X86/sse-fsignum.ll @@ -38,10 +38,10 @@ ; AVX-NEXT: vmovapd (%rdi), %xmm0 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2,2,3] ; AVX-NEXT: vcvtdq2pd %xmm2, %xmm2 ; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: vsubpd %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmovapd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -2104,7 +2104,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: 
vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] ; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; @@ -2123,7 +2123,7 @@ ; ; X64-AVX1-LABEL: test_mm_set_ps1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; @@ -2265,7 +2265,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] ; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; @@ -2284,7 +2284,7 @@ ; ; X64-AVX1-LABEL: test_mm_set1_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; @@ -2623,7 +2623,7 @@ ; X86-AVX1-LABEL: test_mm_store_ps1: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] ; X86-AVX1-NEXT: retl # encoding: 
[0xc3] @@ -2644,7 +2644,7 @@ ; ; X64-AVX1-LABEL: test_mm_store_ps1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] @@ -2710,7 +2710,7 @@ ; X86-AVX1-LABEL: test_mm_store1_ps: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] @@ -2731,7 +2731,7 @@ ; ; X64-AVX1-LABEL: test_mm_store1_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] @@ -2972,7 +2972,7 @@ ; X86-AVX1-LABEL: test_mm_storer_ps: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X86-AVX1-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; X86-AVX1-NEXT: # xmm0 = xmm0[3,2,1,0] ; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] @@ -2980,7 +2980,7 @@ ; X86-AVX512-LABEL: test_mm_storer_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; 
X86-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X86-AVX512-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; X86-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0] ; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] @@ -2994,14 +2994,14 @@ ; ; X64-AVX1-LABEL: test_mm_storer_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X64-AVX1-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; X64-AVX1-NEXT: # xmm0 = xmm0[3,2,1,0] ; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_storer_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X64-AVX512-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; X64-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0] ; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -4523,7 +4523,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] ; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; @@ -5636,7 
+5636,7 @@ ; ; AVX1-LABEL: test_mm_shuffle_epi32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -402,13 +402,13 @@ ; ; AVX1-LABEL: test_x86_sse2_pshuf_d: ; AVX1: ## %bb.0: ## %entry -; AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; AVX1-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; AVX1-NEXT: ## xmm0 = xmm0[3,2,1,0] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: test_x86_sse2_pshuf_d: ; AVX512: ## %bb.0: ## %entry -; AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; AVX512-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; AVX512-NEXT: ## xmm0 = xmm0[3,2,1,0] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] entry: diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -144,7 +144,7 @@ ; X86-AVX-LABEL: test4: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3] ; X86-AVX-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX-NEXT: retl ; @@ -156,7 +156,7 @@ ; ; X64-AVX-LABEL: test4: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3] ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp5 = shufflevector <4 x float> %X, 
<4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] @@ -448,7 +448,7 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovaps (%edx), %xmm0 ; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] -; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; X86-AVX-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX-NEXT: retl ; @@ -464,7 +464,7 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovaps (%rdx), %xmm0 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] -; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp3 = load <4 x float>, ptr %B ; <<4 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll --- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -338,16 +338,16 @@ ; ; AVX1-LABEL: test13: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX512-LABEL: test13: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -407,8 +407,8 @@ ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = 
xmm0[3,3,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -419,8 +419,8 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vbroadcastss %xmm2, %xmm1 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -469,8 +469,8 @@ ; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2 ; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -194,7 +194,7 @@ ; X86-AVX1-LABEL: ext_1: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: pushl %eax ## encoding: [0x50] -; X86-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X86-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X86-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X86-AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] ; X86-AVX1-NEXT: ## fixup A - 
offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -206,7 +206,7 @@ ; X86-AVX512-LABEL: ext_1: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: pushl %eax ## encoding: [0x50] -; X86-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X86-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X86-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X86-AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -225,7 +225,7 @@ ; ; X64-AVX1-LABEL: ext_1: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X64-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X64-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X64-AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] ; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte @@ -233,7 +233,7 @@ ; ; X64-AVX512-LABEL: ext_1: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X64-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X64-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X64-AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte @@ -257,7 +257,7 @@ ; X86-AVX1-LABEL: ext_2: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: pushl %eax ## encoding: [0x50] -; X86-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; 
X86-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X86-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X86-AVX1-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; X86-AVX1-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] @@ -267,7 +267,7 @@ ; X86-AVX512-LABEL: ext_2: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: pushl %eax ## encoding: [0x50] -; X86-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X86-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X86-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X86-AVX512-NEXT: vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24] ; X86-AVX512-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] @@ -282,13 +282,13 @@ ; ; X64-AVX1-LABEL: ext_2: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X64-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X64-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: ext_2: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X64-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X64-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %s = extractelement <4 x float> %v, i32 3 @@ -696,7 +696,7 @@ ; ; AVX1-LABEL: insertps_from_shufflevector_i32_2: ; AVX1: ## %bb.0: ## %entry -; AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; 
AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -704,7 +704,7 @@ ; ; AVX512-LABEL: insertps_from_shufflevector_i32_2: ; AVX512: ## %bb.0: ## %entry -; AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] ; AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -1096,7 +1096,7 @@ ; ; AVX1-LABEL: i32_shuf_XYY0: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] +; AVX1-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xd4] ; AVX1-NEXT: ## xmm0 = xmm0[0,1,1,3] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] ; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] @@ -1105,7 +1105,7 @@ ; ; AVX512-LABEL: i32_shuf_XYY0: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] +; AVX512-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xd4] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3] ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] ; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] @@ -1132,7 +1132,7 @@ ; ; AVX1-LABEL: i32_shuf_XYW0: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpermilps $244, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4] +; AVX1-NEXT: vshufps $244, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xf4] ; AVX1-NEXT: ## xmm0 = xmm0[0,1,3,3] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] ; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] @@ 
-1141,7 +1141,7 @@ ; ; AVX512-LABEL: i32_shuf_XYW0: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $244, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4] +; AVX512-NEXT: vshufps $244, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xf4] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,3,3] ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] ; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] @@ -1169,7 +1169,7 @@ ; ; AVX1-LABEL: i32_shuf_W00W: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] ; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] @@ -1178,7 +1178,7 @@ ; ; AVX512-LABEL: i32_shuf_W00W: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] ; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] @@ -1209,7 +1209,7 @@ ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] ; AVX1-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] ; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] -; AVX1-NEXT: vpermilps $0, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x00] +; AVX1-NEXT: vshufps $0, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0x00] ; AVX1-NEXT: ## xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, 
%xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] ; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] @@ -1245,7 +1245,7 @@ ; AVX1-LABEL: i32_shuf_X00X: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] ; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] @@ -1880,7 +1880,7 @@ ; X86-AVX1-LABEL: insertps_pr20411: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X86-AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X86-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X86-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -1890,7 +1890,7 @@ ; X86-AVX512-LABEL: insertps_pr20411: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X86-AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X86-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -1908,7 +1908,7 @@ ; ; X64-AVX1-LABEL: insertps_pr20411: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X64-AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; X64-AVX1-NEXT: 
## xmm1 = xmm1[2,3,2,3] ; X64-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X64-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -1917,7 +1917,7 @@ ; ; X64-AVX512-LABEL: insertps_pr20411: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X64-AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X64-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] diff --git a/llvm/test/CodeGen/X86/swizzle-avx2.ll b/llvm/test/CodeGen/X86/swizzle-avx2.ll --- a/llvm/test/CodeGen/X86/swizzle-avx2.ll +++ b/llvm/test/CodeGen/X86/swizzle-avx2.ll @@ -25,7 +25,7 @@ define <8 x i32> @swizzle_2(<8 x i32> %v) { ; CHECK-LABEL: swizzle_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> @@ -35,7 +35,7 @@ define <8 x i32> @swizzle_3(<8 x i32> %v) { ; CHECK-LABEL: swizzle_3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -607,7 +607,7 @@ ; ; AVX-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-64-NEXT: vshufps 
{{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX-64-NEXT: vcvttss2si %xmm1, %rax ; AVX-64-NEXT: vmovq %rax, %xmm1 ; AVX-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -661,7 +661,7 @@ ; ; AVX512F-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64: ; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512F-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-64-NEXT: vcvttss2si %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -715,7 +715,7 @@ ; ; AVX512VL-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -774,7 +774,7 @@ ; AVX-32-NEXT: movzbl %al, %eax ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX-32-NEXT: vmovaps %xmm1, %xmm3 ; AVX-32-NEXT: jae .LBB3_4 @@ -836,7 +836,7 @@ ; ; AVX-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-64-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-64-NEXT: vcomiss %xmm1, %xmm3 ; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2 @@ -908,7 +908,7 @@ ; AVX512F-32-NEXT: andl $-8, %esp ; AVX512F-32-NEXT: subl $40, %esp ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512F-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512F-32-NEXT: xorl %eax, %eax ; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 @@ -974,7 +974,7 @@ ; ; AVX512F-64-LABEL: 
strict_vector_fptoui_v4f32_to_v4i64: ; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512F-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-64-NEXT: vcvttss2usi %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1001,7 +1001,7 @@ ; AVX512VL-32-NEXT: andl $-8, %esp ; AVX512VL-32-NEXT: subl $40, %esp ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 @@ -1067,7 +1067,7 @@ ; ; AVX512VL-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -385,7 +385,7 @@ ; AVX512VL-64-LABEL: strict_vector_fptosi_v8f32_to_v8i64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] @@ -399,7 +399,7 @@ ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] ; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] 
; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] @@ -442,7 +442,7 @@ ; AVX512VL-32-NEXT: .cfi_offset %edi, -16 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,3,3,3] +; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 @@ -477,7 +477,7 @@ ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %al @@ -574,7 +574,7 @@ ; AVX512VL-64-LABEL: strict_vector_fptoui_v8f32_to_v8i64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] @@ -588,7 +588,7 @@ ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] ; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ 
b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -209,7 +209,7 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $24, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstps (%esp) @@ -236,7 +236,7 @@ ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm1 -; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512DQ-32-NEXT: vzeroupper @@ -410,7 +410,7 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $24, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, %eax ; AVX-32-NEXT: shrl $31, %eax @@ -471,7 +471,7 @@ ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm1 -; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512DQ-32-NEXT: vzeroupper @@ -887,21 +887,21 @@ ; ; AVX1-32-LABEL: uitofp_v2i1_v2f64: ; AVX1-32: # %bb.0: -; AVX1-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; AVX1-32-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: uitofp_v2i1_v2f64: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vpermilps {{.*#+}} xmm0 
= xmm0[0,2,2,3] +; AVX1-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX1-64-NEXT: retq ; ; AVX512F-LABEL: uitofp_v2i1_v2f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; AVX512F-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0 @@ -923,7 +923,7 @@ ; ; AVX512DQ-LABEL: uitofp_v2i1_v2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512DQ-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %xmm0 @@ -931,14 +931,14 @@ ; ; AVX512DQVL-32-LABEL: uitofp_v2i1_v2f64: ; AVX512DQVL-32: # %bb.0: -; AVX512DQVL-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512DQVL-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512DQVL-32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: retl ; ; AVX512DQVL-64-LABEL: uitofp_v2i1_v2f64: ; AVX512DQVL-64: # %bb.0: -; AVX512DQVL-64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512DQVL-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512DQVL-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: retq @@ -1218,7 +1218,7 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $32, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) @@ -1406,7 +1406,7 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $32, %esp ; 
AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, %eax ; AVX-32-NEXT: shrl $31, %eax diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -641,11 +641,11 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $64, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) @@ -758,11 +758,11 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $64, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, %eax ; AVX-32-NEXT: shrl $31, %eax @@ -919,11 +919,11 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $48, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; 
AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) @@ -1042,11 +1042,11 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $48, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, %eax ; AVX-32-NEXT: shrl $31, %eax diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll @@ -273,18 +273,18 @@ ; NODQ-32-NEXT: subl $128, %esp ; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NODQ-32-NEXT: vmovlps 
%xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) @@ -371,18 +371,18 @@ ; NODQ-32-NEXT: subl $128, %esp ; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm2 ; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractps $1, %xmm3, %eax ; NODQ-32-NEXT: shrl $31, %eax @@ -499,19 +499,19 @@ ; NODQ-32-NEXT: andl $-8, %esp ; NODQ-32-NEXT: subl $96, %esp ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, 
{{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) @@ -595,19 +595,19 @@ ; NODQ-32-NEXT: andl $-8, %esp ; NODQ-32-NEXT: subl $96, %esp ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm3 ; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractps $1, %xmm0, %eax ; NODQ-32-NEXT: shrl $31, %eax diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -954,7 +954,7 @@ ; ; AVX1-LABEL: fptosi_4f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = 
xmm0[3,3,3,3] ; AVX1-NEXT: vcvttss2si %xmm1, %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -972,7 +972,7 @@ ; ; AVX2-LABEL: fptosi_4f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -990,7 +990,7 @@ ; ; AVX512F-LABEL: fptosi_4f32_to_4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2si %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1008,7 +1008,7 @@ ; ; AVX512VL-LABEL: fptosi_4f32_to_4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2si %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1062,7 +1062,7 @@ ; ; AVX1-LABEL: fptosi_8f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX1-NEXT: vcvttss2si %xmm1, %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1080,7 +1080,7 @@ ; ; AVX2-LABEL: fptosi_8f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1103,7 +1103,7 @@ ; AVX512F-NEXT: vcvttss2si %xmm0, %rcx ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512F-NEXT: vcvttss2si %xmm1, %rdx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2si %xmm0, %rsi ; AVX512F-NEXT: vmovq %rsi, 
%xmm0 ; AVX512F-NEXT: vmovq %rdx, %xmm1 @@ -1121,7 +1121,7 @@ ; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512VL-NEXT: vcvttss2si %xmm1, %rdx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2si %xmm0, %rsi ; AVX512VL-NEXT: vmovq %rsi, %xmm0 ; AVX512VL-NEXT: vmovq %rdx, %xmm1 @@ -1566,7 +1566,7 @@ ; ; AVX1-LABEL: fptoui_4f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax @@ -1609,7 +1609,7 @@ ; ; AVX2-LABEL: fptoui_4f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax @@ -1652,7 +1652,7 @@ ; ; AVX512F-LABEL: fptoui_4f32_to_4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2usi %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1670,7 +1670,7 @@ ; ; AVX512VL-LABEL: fptoui_4f32_to_4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1750,7 +1750,7 @@ ; ; AVX1-LABEL: fptoui_8f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: 
vcvttss2si %xmm3, %rax @@ -1793,7 +1793,7 @@ ; ; AVX2-LABEL: fptoui_8f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax @@ -1841,7 +1841,7 @@ ; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi ; AVX512F-NEXT: vmovq %rsi, %xmm0 ; AVX512F-NEXT: vmovq %rdx, %xmm1 @@ -1859,7 +1859,7 @@ ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512VL-NEXT: vcvttss2usi %xmm1, %rdx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rsi ; AVX512VL-NEXT: vmovq %rsi, %xmm0 ; AVX512VL-NEXT: vmovq %rdx, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -933,7 +933,7 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 @@ -3631,7 +3631,7 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: vorps 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 @@ -5431,7 +5431,7 @@ ; ; AVX-LABEL: extract3_sitofp_v4i32_f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 @@ -5457,7 +5457,7 @@ ; ; AVX-LABEL: extract3_sitofp_v4i32_f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 @@ -5489,7 +5489,7 @@ ; ; AVX512F-LABEL: extract3_uitofp_v4i32_f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -5497,13 +5497,13 @@ ; ; AVX512VL-LABEL: extract3_uitofp_v4i32_f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -5511,7 +5511,7 @@ ; ; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VLDQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 @@ -5543,7 +5543,7 @@ ; ; AVX512F-LABEL: extract3_uitofp_v4i32_f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -5551,13 +5551,13 @@ ; ; AVX512VL-LABEL: extract3_uitofp_v4i32_f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -5565,7 +5565,7 @@ ; ; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VLDQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -2748,8 +2748,8 @@ ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] ; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX1-NEXT: 
vpmuludq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -43,7 +43,7 @@ ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsllq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -2252,7 +2252,7 @@ ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vzeroupper ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2324,7 +2324,7 @@ ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2373,7 +2373,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 
= xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2429,7 +2429,7 @@ ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2478,7 +2478,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2729,7 +2729,7 @@ ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vzeroupper ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2806,7 +2806,7 @@ ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2855,7 +2855,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: 
vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2916,7 +2916,7 @@ ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2965,7 +2965,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4432,7 +4432,7 @@ ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4481,7 +4481,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4531,7 +4531,7 @@ ; AVX1-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4580,7 +4580,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4645,7 +4645,7 @@ ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4694,7 +4694,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4744,7 +4744,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: 
vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4793,7 +4793,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -538,10 +538,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovups (%rdi), %xmm0 ; AVX1-NEXT: vmovups 16(%rdi), %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-NEXT: vmovups %xmm1, 48(%rsi) ; AVX1-NEXT: vmovups %xmm3, 32(%rsi) ; AVX1-NEXT: vmovups %xmm0, 16(%rsi) @@ -551,8 +551,8 @@ ; AVX2-LABEL: splat2_i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermpd $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vmovups %ymm0, 32(%rsi) ; AVX2-NEXT: vmovups %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -1170,7 +1170,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm12 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm11, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm13, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 @@ -2278,7 +2278,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm14, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -2321,7 +2321,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 @@ -4701,7 +4701,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -4744,7 +4744,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -4787,7 +4787,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -4830,7 +4830,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm6, %ymm11 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -4080,7 +4080,7 @@ ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[0,1,0,3] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[0,1,0,3] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm4 = xmm1[1],xmm4[1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -2520,7 +2520,7 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] @@ -5302,7 +5302,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload @@ -5326,12 +5326,12 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -5383,14 +5383,14 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm12[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm12[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 @@ -5409,7 +5409,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = 
xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] @@ -5618,7 +5618,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -5649,7 +5649,7 @@ ; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 @@ -5666,7 +5666,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -5690,7 +5690,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -5705,7 +5705,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -5716,7 +5716,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 @@ -5728,7 +5728,7 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 
16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -5763,12 +5763,12 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] @@ -5809,8 +5809,8 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll --- 
a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -28,8 +28,8 @@ ; AVX-LABEL: load_i32_stride2_vf2: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovlps %xmm1, (%rsi) ; AVX-NEXT: vmovlps %xmm0, (%rdx) ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -35,10 +35,10 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,0,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovlps %xmm2, (%rsi) ; AVX1-ONLY-NEXT: vmovlps %xmm3, (%rdx) @@ -49,9 +49,9 @@ ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-ONLY-NEXT: vbroadcastss 8(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rsi) @@ -63,9 +63,9 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] ; AVX512F-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512F-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512F-SLOW-NEXT: vmovlps %xmm2, (%rsi) @@ -91,9 +91,9 @@ ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-SLOW-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] ; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512BW-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm3 ; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512BW-SLOW-NEXT: vmovlps %xmm2, (%rsi) @@ -153,14 +153,14 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,0,3,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rcx) @@ -260,14 +260,14 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] @@ -301,7 +301,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) 
@@ -361,7 +361,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) @@ -506,7 +506,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm9 @@ -514,21 +514,21 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[2,0],ymm7[3,0],ymm3[6,4],ymm7[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = 
ymm12[1,2],ymm11[0,3],ymm12[5,6],ymm11[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[3,0],ymm0[6,4],ymm9[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm9[0,0],ymm12[2,0],ymm9[4,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm6[2,0],ymm11[5,4],ymm6[6,4] @@ -581,12 +581,12 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) @@ -678,12 +678,12 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) @@ -968,7 +968,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm10[2,0],ymm8[5,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] @@ -979,7 +979,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4] ; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 
; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -990,7 +990,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 @@ -1002,7 +1002,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1012,7 +1012,7 @@ ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,2],ymm13[0,3],ymm8[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,0],ymm14[3,0],ymm6[6,4],ymm14[7,4] @@ -1020,7 +1020,7 @@ ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm10 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] @@ -1028,7 +1028,7 @@ ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm12[3,0],ymm2[6,4],ymm12[7,4] @@ -1038,7 +1038,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload @@ -1155,22 +1155,22 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -1360,22 +1360,22 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -1930,7 +1930,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] @@ -1939,7 +1939,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] @@ -1949,7 +1949,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm5[2,0],ymm2[5,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 @@ -1964,7 +1964,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 @@ -1978,7 +1978,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = 
ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[2,0],ymm1[5,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 @@ -1993,7 +1993,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 @@ -2008,7 +2008,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4] ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm11 @@ -2023,7 +2023,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2036,7 +2036,7 @@ ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2049,7 +2049,7 @@ ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2062,7 +2062,7 @@ ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2075,7 +2075,7 @@ ; AVX1-ONLY-NEXT: 
vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2086,7 +2086,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm3[0,3],ymm2[5,6],ymm3[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2096,7 +2096,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,2],ymm4[0,3],ymm5[5,6],ymm4[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2106,7 +2106,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = 
ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm6[0,3],ymm15[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -2117,7 +2117,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm11[2],ymm5[3,4],ymm11[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm7[0,3],ymm14[5,6],ymm7[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload @@ -2383,12 +2383,12 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; 
AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload @@ -2414,7 +2414,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -2428,7 +2428,7 @@ ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm12, %ymm6 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -2865,12 +2865,12 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = 
ymm3[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload @@ -2896,7 +2896,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -2910,7 +2910,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -227,7 +227,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = 
xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] @@ -434,7 +434,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] @@ -447,7 +447,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,2],ymm12[6,4],ymm1[6,6] @@ -455,7 +455,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm11[1,0],ymm0[6,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, 
%ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] @@ -863,7 +863,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] @@ -876,7 +876,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] @@ -891,7 +891,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm2[2,0],ymm0[7,4],ymm2[6,4] @@ -905,7 +905,7 @@ ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] @@ -913,7 +913,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm2[3,0],ymm11[4,4],ymm2[7,4] @@ -921,7 +921,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm3[1,0],ymm0[6,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] @@ -1793,7 +1793,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -1809,7 +1809,7 @@ ; 
AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -1825,7 +1825,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -1841,7 +1841,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -1859,7 +1859,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm1[0,0],ymm5[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm7[2,0],ymm13[7,4],ymm7[6,4] @@ -1874,7 +1874,7 @@ ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,0],ymm14[2,0],ymm10[7,4],ymm14[6,4] @@ -1888,7 +1888,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm13[0,0],ymm9[5,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1904,7 +1904,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,0],ymm10[0,0],ymm4[5,4],ymm10[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1916,7 +1916,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,0],ymm11[1,0],ymm5[6,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1928,7 +1928,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1940,7 +1940,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm13[1,0],ymm9[6,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1952,7 +1952,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm10[1,0],ymm4[6,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload @@ -3734,7 +3734,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3751,7 +3751,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3768,7 +3768,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3785,7 +3785,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3801,7 +3801,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3817,7 +3817,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3834,7 +3834,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm4 ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3851,7 +3851,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3872,7 +3872,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm10[0,0],ymm1[5,4],ymm10[4,4] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3891,7 +3891,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3910,7 +3910,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3929,7 +3929,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm7[0,0],ymm1[5,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3947,7 +3947,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3964,7 +3964,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm9[0,0],ymm3[5,4],ymm9[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3980,7 +3980,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3998,7 +3998,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm1[0,0],ymm8[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4012,7 +4012,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm10[1,0],ymm1[6,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4027,7 +4027,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: 
vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4042,7 +4042,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4056,7 +4056,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm7[1,0],ymm1[6,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4071,7 +4071,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps 
{{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4083,7 +4083,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,0],ymm2[1,0],ymm8[6,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4095,7 +4095,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4107,7 +4107,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,0],ymm9[1,0],ymm3[6,4],ymm9[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 
$1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -51,18 +51,18 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; AVX1-ONLY-NEXT: vmovlps %xmm3, (%rsi) ; AVX1-ONLY-NEXT: vmovlps %xmm4, (%rdx) ; AVX1-ONLY-NEXT: vmovlps %xmm5, (%rcx) @@ -78,13 +78,13 @@ ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[2,2,3,3] +; AVX2-ONLY-NEXT: 
vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <4,2,u,u> ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] @@ -307,16 +307,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[3,3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,0,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx) @@ -558,7 +558,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7] ; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[0],ymm10[3],ymm4[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[2,0],ymm5[2,3],ymm11[6,4],ymm5[6,7] @@ -566,7 +566,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm4[1,3],ymm10[7,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] @@ -589,7 +589,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm12 @@ -600,7 +600,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] @@ -628,7 +628,7 @@ ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[0,1],ymm5[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] @@ -636,42 +636,42 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = 
ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = 
ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u> ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm4 @@ -681,7 +681,7 @@ ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm3 @@ -713,7 +713,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[0,1],ymm5[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] @@ -721,7 +721,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, 
%ymm8 @@ -730,33 +730,33 @@ ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u> ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4 @@ -766,7 +766,7 @@ ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 @@ -798,7 +798,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[0,1],ymm5[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] @@ -806,42 +806,42 @@ ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u> ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps 
%ymm3, %ymm11, %ymm4 @@ -851,7 +851,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm3 @@ -1253,7 +1253,7 @@ ; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm7[0],ymm14[0],ymm7[3],ymm14[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm1 @@ -1270,7 +1270,7 @@ ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[0,1] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm12[0],ymm5[3],ymm12[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm13[3,0],ymm6[1,0],ymm13[7,4],ymm6[5,4] @@ -1280,7 +1280,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,1],ymm14[1,3],ymm7[7,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = 
ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm4 @@ -1290,7 +1290,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1],ymm12[1,3],ymm5[7,5],ymm12[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1341,7 +1341,7 @@ ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm7 @@ -1356,7 +1356,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0],xmm12[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2 @@ -1373,7 +1373,7 @@ ; AVX1-ONLY-NEXT: vshufps $215, (%rsp), %ymm7, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = 
ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1],ymm5[2,0],ymm14[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm12[1,0],ymm9[7,4],ymm12[5,4] @@ -1381,7 +1381,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm3[1,3],ymm2[7,5],ymm3[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm2[2,0],ymm13[5,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1431,7 +1431,7 @@ ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm6 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596] @@ -1443,7 +1443,7 @@ ; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm15 @@ -1453,73 +1453,73 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: 
vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -1528,7 +1528,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] @@ -1542,13 +1542,13 @@ ; AVX2-SLOW-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 @@ -1557,7 
+1557,7 @@ ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] @@ -1607,7 +1607,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596] @@ -1619,7 +1619,7 @@ ; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm15 @@ -1629,21 +1629,21 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: 
vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] @@ -1651,9 +1651,9 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4],ymm2[5,6,7] @@ -1661,40 +1661,40 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: 
vpermilps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -1703,7 +1703,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] @@ -1717,13 +1717,13 @@ ; AVX2-FAST-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 @@ -1732,7 +1732,7 @@ ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] @@ -1782,7 +1782,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596] @@ -1794,7 +1794,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm15 @@ -1804,73 +1804,73 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: 
vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = 
ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -1879,7 +1879,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] @@ -1893,13 +1893,13 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 @@ -1908,7 +1908,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] @@ -2782,7 +2782,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[0],ymm5[0],ymm12[3],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm3, %ymm5 @@ -2803,7 +2803,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 @@ -2829,7 +2829,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 @@ -2854,7 +2854,7 @@ ; AVX1-ONLY-NEXT: vmovupd %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm13[0,1] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[3],ymm0[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -2865,7 +2865,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm11[1,3],ymm12[7,5],ymm11[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -2878,7 +2878,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm8[1,3],ymm0[7,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2890,7 +2890,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = 
ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -2901,7 +2901,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,1],ymm7[1,3],ymm15[7,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3017,7 +3017,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -3036,7 +3036,7 @@ ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] @@ 
-3055,7 +3055,7 @@ ; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm12 @@ -3071,7 +3071,7 @@ ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0],xmm5[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm5 @@ -3094,7 +3094,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm15[1,1],ymm8[2,0],ymm15[5,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm3[1,0],ymm6[7,4],ymm3[5,4] @@ -3107,7 +3107,7 @@ ; AVX1-ONLY-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1],ymm6[2,0],ymm8[5,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4] @@ -3118,7 +3118,7 @@ ; AVX1-ONLY-NEXT: # ymm6 = ymm12[3,1],mem[1,3],ymm12[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1],ymm6[2,0],ymm8[5,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm14[1,0],ymm4[7,4],ymm14[5,4] @@ -3127,7 +3127,7 @@ ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1],ymm5[2,0],ymm7[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -3205,7 +3205,7 @@ ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596] @@ -3219,7 +3219,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3241,7 +3241,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill @@ -3263,7 +3263,7 @@ ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3275,59 +3275,59 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] ; 
AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = 
mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] @@ -3341,13 +3341,13 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] @@ -3360,69 +3360,69 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = 
ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = 
ymm9[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3430,10 +3430,10 @@ ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = 
ymm12[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] @@ -3442,7 +3442,7 @@ ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3454,7 +3454,7 @@ ; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9 @@ -3474,7 +3474,7 @@ ; AVX2-SLOW-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] @@ -3488,7 +3488,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 656(%rdi), 
%xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm1, %ymm11 ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] @@ -3505,7 +3505,7 @@ ; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3513,7 +3513,7 @@ ; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> @@ -3523,19 +3523,19 @@ ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; 
AVX2-SLOW-NEXT: vpermps %ymm10, %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -3615,7 +3615,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596] @@ -3629,7 +3629,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill @@ -3651,7 +3651,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3673,7 +3673,7 @@ ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3685,34 +3685,34 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps 
{{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -3722,17 +3722,17 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 
32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $51, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload @@ -3750,7 +3750,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] @@ -3772,64 +3772,64 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm13 -; 
AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps 
{{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm13[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3837,10 +3837,10 @@ ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} 
ymm1 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] @@ -3849,7 +3849,7 @@ ; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3862,7 +3862,7 @@ ; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] @@ -3880,7 +3880,7 @@ ; AVX2-FAST-NEXT: # ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -3893,7 +3893,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; 
AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm11 ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] @@ -3910,7 +3910,7 @@ ; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3918,7 +3918,7 @@ ; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> @@ -3928,19 +3928,19 @@ ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 
= ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm15, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -4020,7 +4020,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596] @@ -4034,7 +4034,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4056,7 +4056,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill @@ -4078,7 +4078,7 @@ ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4090,59 +4090,59 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893] ; 
AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] @@ -4156,13 +4156,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] @@ -4175,69 +4175,69 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 
= ymm9[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -4245,10 +4245,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] @@ -4257,7 +4257,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = 
ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -4269,7 +4269,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9 @@ -4289,7 +4289,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] @@ -4303,7 +4303,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] @@ -4320,7 +4320,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm1 = 
ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -4328,7 +4328,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> @@ -4338,19 +4338,19 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps 
{{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -6048,7 +6048,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm3, %ymm1 @@ -6070,7 +6070,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -6097,7 +6097,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd 
%ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -6124,7 +6124,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 @@ -6151,7 +6151,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 @@ -6178,7 +6178,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 @@ -6205,7 +6205,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 @@ -6233,7 +6233,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm15[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6248,7 +6248,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6262,7 +6262,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6275,7 +6275,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm1[3,1],ymm13[1,3],ymm1[7,5],ymm13[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6288,7 +6288,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1],ymm14[1,3],ymm1[7,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6301,7 +6301,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6314,7 +6314,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -6327,7 +6327,7 @@ ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -6341,7 +6341,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm2[1,3],ymm1[7,5],ymm2[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6591,7 +6591,7 @@ ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6611,7 +6611,7 @@ ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6635,7 +6635,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6654,7 +6654,7 @@ ; AVX1-ONLY-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: 
vmovaps 592(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6674,7 +6674,7 @@ ; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6693,7 +6693,7 @@ ; AVX1-ONLY-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6714,7 +6714,7 @@ ; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6734,7 +6734,7 @@ ; AVX1-ONLY-NEXT: # ymm7 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; 
AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] @@ -6761,7 +6761,7 @@ ; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,1],mem[1,3],ymm15[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload @@ -6775,7 +6775,7 @@ ; AVX1-ONLY-NEXT: # ymm15 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,1],ymm15[2,0],ymm8[5,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -6789,7 +6789,7 @@ ; AVX1-ONLY-NEXT: # ymm15 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,1],ymm15[2,0],ymm8[5,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] @@ -6802,7 +6802,7 @@ ; AVX1-ONLY-NEXT: # 
ymm13 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm8[1,1],ymm13[2,0],ymm8[5,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4] @@ -6815,7 +6815,7 @@ ; AVX1-ONLY-NEXT: # ymm12 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[1,1],ymm12[2,0],ymm8[5,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0],ymm2[1,0],ymm10[7,4],ymm2[5,4] @@ -6828,7 +6828,7 @@ ; AVX1-ONLY-NEXT: # ymm11 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm8[1,1],ymm11[2,0],ymm8[5,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm1[1,0],ymm9[7,4],ymm1[5,4] @@ -6841,7 +6841,7 @@ ; AVX1-ONLY-NEXT: # ymm10 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,1],ymm10[2,0],ymm8[5,5],ymm10[6,4] 
-; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm0[1,0],ymm7[7,4],ymm0[5,4] @@ -6854,7 +6854,7 @@ ; AVX1-ONLY-NEXT: # ymm9 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm9[2,0],ymm8[5,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -6976,7 +6976,7 @@ ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6993,7 +6993,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7018,7 +7018,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill @@ -7043,7 +7043,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7066,7 +7066,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7088,7 +7088,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] 
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7110,7 +7110,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7132,7 +7132,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7173,43 +7173,43 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = 
ymm10[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -7218,14 +7218,14 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -7239,19 +7239,19 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -7260,34 +7260,34 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -7301,14 +7301,14 @@ ; 
AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -7322,14 +7322,14 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; 
AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] @@ -7342,26 +7342,26 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] @@ -7372,7 +7372,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] @@ -7384,7 +7384,7 @@ ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4],ymm13[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] @@ -7399,7 +7399,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[3,3,3,3,7,7,7,7] @@ -7408,7 +7408,7 @@ ; 
AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] @@ -7420,24 +7420,24 @@ ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] @@ -7451,19 +7451,19 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -7471,19 +7471,19 @@ ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -7519,7 +7519,7 @@ ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -7539,7 +7539,7 @@ ; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] @@ -7558,7 +7558,7 @@ ; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -7577,7 +7577,7 @@ ; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10, %ymm2 # 32-byte Folded Reload @@ -7597,7 +7597,7 @@ ; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload @@ -7613,7 +7613,7 @@ ; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -7630,7 +7630,7 @@ ; AVX2-SLOW-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1424(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -7647,7 +7647,7 @@ ; AVX2-SLOW-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, 
%ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -7703,20 +7703,20 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] @@ -7845,7 +7845,7 @@ ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = 
ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7862,7 +7862,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7887,7 +7887,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7912,7 +7912,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7935,7 +7935,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7957,7 +7957,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7979,7 +7979,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8001,7 +8001,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8042,31 +8042,31 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -8086,7 +8086,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] @@ -8105,7 +8105,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] @@ -8116,7 +8116,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] @@ -8125,7 +8125,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] @@ -8136,16 +8136,16 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0],ymm7[1,2,3],ymm5[4],ymm7[5,6,7] @@ -8156,7 +8156,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] @@ -8165,7 +8165,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7] @@ -8185,7 +8185,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] @@ -8208,21 +8208,21 @@ ; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = 
xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm4[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm4[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] @@ -8241,7 +8241,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6,7] @@ -8260,7 +8260,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] @@ -8271,7 +8271,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] @@ -8283,13 +8283,13 @@ ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = 
ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] @@ -8297,7 +8297,7 @@ ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] @@ -8306,26 +8306,26 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -8333,19 +8333,19 @@ ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -8381,7 +8381,7 @@ ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -8402,7 +8402,7 @@ ; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] @@ -8421,7 +8421,7 @@ ; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; 
AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8439,7 +8439,7 @@ ; AVX2-FAST-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8460,7 +8460,7 @@ ; AVX2-FAST-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8477,7 +8477,7 @@ ; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8495,7 +8495,7 @@ ; AVX2-FAST-NEXT: # ymm12 = 
ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8511,7 +8511,7 @@ ; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -8566,20 +8566,20 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3],ymm7[4,5,6,7] ; 
AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] @@ -8707,7 +8707,7 @@ ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8724,7 +8724,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8749,7 +8749,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill @@ -8774,7 +8774,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8797,7 +8797,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8819,7 +8819,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8841,7 +8841,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8863,7 +8863,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8904,43 +8904,43 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, 
%ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm9, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = 
mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -8949,14 +8949,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -8970,19 +8970,19 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] ; 
AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -8991,34 +8991,34 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -9032,14 +9032,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 
32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -9053,14 +9053,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] @@ -9073,26 +9073,26 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[2,3,2,3] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] @@ -9103,7 +9103,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] @@ -9115,7 +9115,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4],ymm13[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] @@ -9130,7 +9130,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[3,3,3,3,7,7,7,7] @@ -9139,7 +9139,7 @@ ; AVX2-FAST-PERLANE-NEXT: 
vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] @@ -9151,24 +9151,24 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] @@ -9182,19 +9182,19 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -9202,19 +9202,19 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9250,7 +9250,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -9270,7 +9270,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] @@ -9289,7 +9289,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9308,7 +9308,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload @@ -9328,7 +9328,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload @@ -9344,7 +9344,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9361,7 +9361,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1424(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9378,7 +9378,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -9434,20 +9434,20 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -58,21 +58,21 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovlps %xmm5, (%rsi) ; AVX1-ONLY-NEXT: vmovlps %xmm6, (%rdx) @@ -95,18 +95,18 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[1,0,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] ; AVX2-ONLY-NEXT: vbroadcastss 8(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm4 = <4,3,u,u> ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,2,3,5,4,6,7] +; AVX2-ONLY-NEXT: 
vshufps {{.*#+}} ymm7 = ymm7[1,0,2,3,5,4,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rsi) @@ -346,37 +346,37 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,1,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = 
xmm3[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[0,1],xmm5[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) @@ -401,9 +401,9 @@ ; AVX2-SLOW-NEXT: vbroadcastss 84(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm6 @@ -415,9 +415,9 @@ ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] @@ -426,13 +426,13 @@ ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] @@ -461,7 +461,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3] ; 
AVX2-FAST-NEXT: vbroadcastss 8(%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 @@ -472,9 +472,9 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,1,0] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] @@ -483,13 +483,13 @@ ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] @@ -515,9 +515,9 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 84(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = 
xmm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 8(%rdi), %xmm6 @@ -529,9 +529,9 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] @@ -540,13 +540,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = 
xmm8[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] @@ -761,7 +761,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 @@ -781,7 +781,7 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm10[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm9[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm11[0,3],ymm2[7,5],ymm11[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4] @@ -796,7 +796,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,0],ymm12[0,0],ymm11[5,4],ymm12[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[3,1],ymm11[0,2],ymm12[7,5],ymm11[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[0,1],ymm4[1,3],ymm14[4,5],ymm4[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = 
ymm8[0,2],ymm12[2,0],ymm8[4,6],ymm12[6,4] @@ -806,9 +806,9 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm13[0,0],ymm8[7,4],ymm13[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm12[2,0],ymm8[6,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] @@ -818,20 +818,20 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[0,1],xmm9[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm14[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,0],ymm9[0,0],ymm14[7,4],ymm9[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,0],ymm5[4,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3] ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -1576,7 +1576,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1595,7 +1595,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1639,7 +1639,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] @@ -1651,7 +1651,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, 
%ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm4[0,3],ymm12[7,5],ymm4[4,7] @@ -1666,7 +1666,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm7[0,0],ymm5[5,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm8[1,3],ymm7[4,5],ymm8[5,7] @@ -1676,7 +1676,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm13[0,0],ymm4[5,4],ymm13[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,1],ymm2[0,2],ymm13[7,5],ymm2[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm3[1,3],ymm5[4,5],ymm3[5,7] @@ -1689,9 +1689,9 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] @@ -1701,10 +1701,10 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] @@ -1719,7 +1719,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[0,1],xmm2[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm6[4,5,6,7] @@ -1733,13 +1733,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = 
ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[0,1],xmm6[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,0],ymm6[0,0],ymm7[7,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm1[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] @@ -1747,13 +1747,13 @@ ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm5[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm7[0,0],ymm5[7,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] @@ -1761,7 +1761,7 @@ ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] 
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] @@ -3460,7 +3460,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3481,7 +3481,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3504,7 +3504,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3532,7 +3532,7 @@ ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] 
; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3636,7 +3636,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload @@ -3652,7 +3652,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload @@ -3669,7 +3669,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm6[0,3],ymm15[7,5],ymm6[4,7] @@ -3683,7 +3683,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups 
%ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm0[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -3705,7 +3705,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm10[3,1],ymm6[0,2],ymm10[7,5],ymm6[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = xmm5[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3719,7 +3719,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm6[0,2],ymm8[7,5],ymm6[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -3730,7 +3730,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[0,0],ymm3[5,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,1],ymm8[0,2],ymm7[7,5],ymm8[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm15[1,3],ymm9[4,5],ymm15[5,7] @@ -3745,7 +3745,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload @@ -3762,7 +3762,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] @@ -3777,7 +3777,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0],ymm3[2,0],ymm2[6,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload @@ -3794,7 +3794,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = 
xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload @@ -3809,7 +3809,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] @@ -3831,7 +3831,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[0,1],xmm10[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] @@ -3848,7 +3848,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm3[0,0],ymm4[1,0],ymm3[4,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[0,1],xmm12[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm10[4,5,6,7] @@ -3864,7 +3864,7 @@ ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0],ymm14[1,0],ymm10[4,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[0,1],xmm1[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] @@ -3880,13 +3880,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[0,1],xmm13[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm1[0,0],ymm6[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] @@ -3894,7 +3894,7 @@ ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm3[1,0],ymm4[2,0],ymm3[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -3914,14 +3914,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] @@ -3930,7 +3930,7 @@ ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm14[2,0],ymm10[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] @@ -3948,7 +3948,7 @@ ; AVX1-ONLY-NEXT: # xmm7 = xmm8[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = 
ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -4285,8 +4285,8 @@ ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 @@ -4358,13 +4358,13 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 @@ -4372,41 
+4372,41 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] @@ -4796,8 +4796,8 @@ ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -5308,8 +5308,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] 
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm14 @@ -5381,13 +5381,13 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 @@ -5395,41 +5395,41 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: 
# ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] @@ -7429,7 +7429,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7452,7 +7452,7 @@ ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7474,7 +7474,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7501,7 +7501,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7528,7 +7528,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7555,7 +7555,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7583,7 +7583,7 @@ ; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7610,7 +7610,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 @@ -7805,7 +7805,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7822,7 +7822,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm11[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7839,7 +7839,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, 
%ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm11[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -7857,7 +7857,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm9[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -7875,7 +7875,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm4[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm6[0,3],ymm14[7,5],ymm6[4,7] @@ -7951,7 +7951,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = 
xmm13[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7968,7 +7968,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7982,7 +7982,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm8[0,2],ymm0[7,5],ymm8[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm11[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7996,7 +7996,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm8[0,2],ymm0[7,5],ymm8[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8012,7 +8012,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8026,7 +8026,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm2[0,2],ymm3[7,5],ymm2[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm14 @@ -8044,7 +8044,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -8062,7 +8062,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 
32-byte Reload @@ -8079,9 +8079,9 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] @@ -8095,7 +8095,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] @@ -8110,13 +8110,13 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm3[2,0],ymm12[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -8130,7 +8130,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] @@ -8146,7 +8146,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] @@ -8165,7 +8165,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] @@ -8183,7 +8183,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} xmm10 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -8200,7 +8200,7 @@ ; AVX1-ONLY-NEXT: # ymm3 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload @@ -8222,7 +8222,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm14[0,0],ymm15[1,0],ymm14[4,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[0,1],xmm10[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] @@ -8239,7 +8239,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,0],mem[1,0],ymm10[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1],xmm8[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] @@ 
-8257,7 +8257,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[0,1],xmm7[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] @@ -8275,7 +8275,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] @@ -8294,7 +8294,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] @@ -8312,7 +8312,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} 
ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -8330,7 +8330,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -8346,14 +8346,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,1],xmm0[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -8363,14 +8363,14 @@ ; AVX1-ONLY-NEXT: 
vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,0],ymm15[2,0],ymm14[5,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -8382,7 +8382,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -8401,14 +8401,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = 
ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -8420,7 +8420,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -8440,14 +8440,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = 
ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,0],ymm3[0,0],ymm6[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 @@ -8459,7 +8459,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] @@ -8479,7 +8479,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] @@ -8498,7 +8498,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] @@ -9092,8 +9092,8 @@ ; AVX2-SLOW-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm4[1,3],ymm10[4,6],ymm4[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm14 @@ -9105,8 +9105,8 @@ ; AVX2-SLOW-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] @@ -9119,8 +9119,8 @@ ; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -9134,8 +9134,8 @@ ; AVX2-SLOW-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -9175,8 +9175,8 @@ ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm11[1,3],ymm12[4,6],ymm11[5,7] @@ -9189,8 +9189,8 @@ ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: 
vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -9321,13 +9321,13 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 @@ -9336,14 +9336,14 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] @@ -9351,7 +9351,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -9359,7 +9359,7 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] @@ -9367,7 +9367,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -9375,7 +9375,7 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: 
vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -9383,7 +9383,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 @@ -9391,14 +9391,14 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, 
%ymm4, %xmm4 @@ -9406,7 +9406,7 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] @@ -9414,27 +9414,27 @@ ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm13[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] 
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] @@ -10134,8 +10134,8 @@ ; AVX2-FAST-NEXT: # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7] @@ -10161,8 +10161,8 @@ ; AVX2-FAST-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 
# 32-byte Folded Reload @@ -10176,8 +10176,8 @@ ; AVX2-FAST-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -10217,8 +10217,8 @@ ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm7[1,3],ymm9[4,6],ymm7[5,7] @@ -10231,8 +10231,8 @@ ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload @@ -10370,7 +10370,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 @@ -10379,7 +10379,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -10394,7 +10394,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -10409,7 +10409,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte 
Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -10423,7 +10423,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 @@ -10438,7 +10438,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -10452,7 +10452,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm8, %xmm8 @@ -10465,7 +10465,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 @@ -11183,8 +11183,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm4[1,3],ymm10[4,6],ymm4[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm14 @@ -11196,8 +11196,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] @@ -11210,8 +11210,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = 
xmm4[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -11225,8 +11225,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -11266,8 +11266,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = 
ymm12[0,2],ymm11[1,3],ymm12[4,6],ymm11[5,7] @@ -11280,8 +11280,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -11412,13 +11412,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 @@ -11427,14 +11427,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] @@ -11442,7 +11442,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -11450,7 +11450,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: 
vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] @@ -11458,7 +11458,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -11466,7 +11466,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -11474,7 +11474,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; 
AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 @@ -11482,14 +11482,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -11497,7 +11497,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] @@ -11505,27 +11505,27 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm13[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -61,12 +61,12 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovq %xmm4, (%rsi) ; AVX1-ONLY-NEXT: vmovq %xmm5, (%rdx) @@ -95,12 +95,12 @@ ; AVX2-ONLY-NEXT: vbroadcastss 48(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss 16(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-ONLY-NEXT: 
vextractf128 $1, %ymm0, %xmm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovq %xmm4, (%rsi) ; AVX2-ONLY-NEXT: vmovq %xmm5, (%rdx) @@ -128,12 +128,12 @@ ; AVX512F-SLOW-NEXT: vmovaps (%rdi), %ymm4 ; AVX512F-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] ; AVX512F-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX512F-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovq %xmm2, (%rsi) ; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rdx) @@ -195,12 +195,12 @@ ; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %ymm4 ; AVX512BW-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] ; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] ; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX512BW-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: 
vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512BW-SLOW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rdx) @@ -330,17 +330,17 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm9[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 @@ -348,11 +348,11 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = 
xmm10[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 @@ -360,7 +360,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsi) @@ -393,10 +393,10 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1,2],xmm10[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1],xmm10[2,3] @@ -605,18 +605,18 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7] @@ -626,11 +626,11 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1,2],xmm13[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = 
ymm13[0,1,2,3],ymm10[4,5,6,7] @@ -640,7 +640,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7] @@ -648,7 +648,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,0],ymm12[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 @@ -660,7 +660,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] @@ -668,7 +668,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,0],ymm15[4,5],ymm14[6,4] 
; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -680,7 +680,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] @@ -728,12 +728,12 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm12[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] @@ -741,13 +741,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] @@ -755,7 +755,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm15[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm15[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] @@ -1146,7 +1146,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -1160,7 +1160,7 @@ ; 
AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm15 @@ -1172,22 +1172,22 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -1196,14 +1196,14 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] @@ -1212,13 +1212,13 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm15[2],xmm7[3],xmm15[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} xmm5 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] @@ -1229,7 +1229,7 @@ ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 @@ -1247,7 +1247,7 @@ ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -1261,7 +1261,7 @@ ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -1280,7 +1280,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 @@ -1300,7 +1300,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm2[1,0],ymm0[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] @@ -1313,7 +1313,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,0],ymm14[1,0],ymm13[5,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] @@ -1326,7 +1326,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -1336,7 +1336,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 @@ -1352,7 +1352,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1362,7 +1362,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0],ymm14[3,0],ymm13[7,4],ymm14[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -1461,19 +1461,19 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps %xmm15, %xmm10 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] @@ -1485,7 +1485,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 @@ -1494,12 +1494,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2] ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm3[0,1,2],xmm13[3] ; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] @@ -1509,13 +1509,13 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] @@ -1528,7 +1528,7 @@ ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm6 @@ -1540,7 
+1540,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm7 @@ -2473,7 +2473,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2490,7 +2490,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 @@ -2519,7 +2519,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 @@ -2546,7 +2546,7 @@ ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm7 @@ -2560,19 +2560,19 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; 
AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2586,7 +2586,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload @@ -2598,7 +2598,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] @@ -2622,7 +2622,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] @@ -2644,7 +2644,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps 
{{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] @@ -2658,14 +2658,14 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] @@ -2677,14 +2677,14 @@ ; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] @@ -2759,7 +2759,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm14 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2786,7 +2786,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2813,7 +2813,7 @@ ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] @@ -2834,7 +2834,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2854,7 +2854,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2871,7 +2871,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm9[1,0],ymm7[5,4],ymm9[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2888,7 +2888,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,0],ymm2[1,0],ymm5[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = 
ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2906,7 +2906,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2922,7 +2922,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2936,7 +2936,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2952,7 +2952,7 @@ ; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2966,7 +2966,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[6],ymm15[6],ymm9[7],ymm15[7] @@ -2986,7 +2986,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2999,7 +2999,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3017,7 +3017,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -3029,7 +3029,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,0],ymm9[3,0],ymm15[7,4],ymm9[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -3225,7 +3225,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] @@ -3261,21 +3261,21 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm8[0],mem[0],xmm8[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -3287,7 +3287,7 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -3301,13 +3301,13 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] @@ -3320,13 +3320,13 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm11[2],mem[2],xmm11[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1,2],xmm14[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte 
Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] @@ -3334,12 +3334,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1,2],xmm8[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1],xmm14[2,3] @@ -3349,7 +3349,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 @@ -3359,7 +3359,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] ; 
AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] @@ -3397,14 +3397,14 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] @@ -5428,7 +5428,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -5445,7 +5445,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 @@ -5474,7 +5474,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 @@ -5502,7 +5502,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 @@ -5530,7 +5530,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 @@ -5559,7 +5559,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm4 @@ -5588,7 +5588,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm12 @@ -5616,7 +5616,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8 @@ -5632,7 +5632,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = 
xmm0[0],xmm7[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -5668,7 +5668,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 @@ -5717,7 +5717,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] @@ -5730,11 +5730,11 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -5860,7 +5860,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3] @@ -5882,7 +5882,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm7[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] @@ -5894,14 +5894,14 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] @@ -6035,7 +6035,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6062,7 +6062,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6089,7 +6089,7 @@ ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6116,7 +6116,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6143,7 +6143,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6170,7 +6170,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] @@ -6197,7 +6197,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] @@ -6218,7 +6218,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6242,7 +6242,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6259,7 +6259,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm5[1,0],ymm1[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6278,7 +6278,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 
32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6297,7 +6297,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6316,7 +6316,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6334,7 +6334,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, 
%ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] @@ -6353,7 +6353,7 @@ ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] @@ -6372,7 +6372,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6389,7 +6389,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6407,7 +6407,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 
32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6423,7 +6423,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6440,7 +6440,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6458,7 +6458,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = 
xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6475,7 +6475,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6494,7 +6494,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6509,7 +6509,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload @@ -6530,7 +6530,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] @@ -6547,7 +6547,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] @@ -6564,7 +6564,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] @@ -6582,7 +6582,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] @@ -6599,7 
+6599,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -6617,7 +6617,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6633,7 +6633,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] @@ -6647,7 +6647,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm15[3,0],ymm9[7,4],ymm15[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] @@ -7020,7 +7020,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7058,7 +7058,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7077,7 +7077,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7096,13 +7096,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -7176,7 +7176,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7197,14 +7197,14 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: 
vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] @@ -7218,7 +7218,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7239,7 +7239,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7259,13 +7259,13 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] @@ -7279,14 +7279,14 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -7299,13 +7299,13 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[2,2,2,2] +; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm9[0,1,2],xmm15[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -169,7 +169,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 @@ -298,11 +298,11 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 @@ -543,19 +543,19 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm7 @@ -1070,39 +1070,39 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm8 ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = 
ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 @@ -2193,97 +2193,97 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} 
ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm9[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 
= mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -267,7 +267,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] @@ -1692,8 +1692,8 @@ ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm4[0,1,0,1] -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] @@ -1740,8 +1740,8 @@ ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] @@ -3842,7 +3842,7 @@ ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] @@ -8117,7 +8117,7 @@ ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm11[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -1174,8 +1174,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] @@ -1183,10 +1183,10 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -57,8 +57,8 @@ ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -39,7 +39,7 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,u,1,u,5,u,u] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -273,12 +273,12 @@ ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] 
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -301,7 +301,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] @@ -330,12 +330,12 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -526,7 +526,7 @@ ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] @@ -538,17 +538,17 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; 
AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] @@ -570,7 +570,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm4 ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[1,1,2,2] @@ -582,7 +582,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[1,1,2,2] @@ -627,7 +627,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] @@ -639,17 +639,17 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = 
ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] @@ -1024,7 +1024,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] @@ -1038,7 +1038,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] @@ -1052,7 +1052,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] @@ -1065,29 +1065,29 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2] @@ -1125,7 +1125,7 @@ ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm14 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2] @@ -1150,12 +1150,12 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] ; 
AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2] @@ -1171,7 +1171,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7] ; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 @@ -1229,7 +1229,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] @@ -1243,7 +1243,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] @@ -1257,7 +1257,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] @@ -1270,29 +1270,29 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2] @@ -2048,7 +2048,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] @@ -2062,7 +2062,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] @@ -2076,7 +2076,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] @@ -2092,7 +2092,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2110,7 +2110,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2127,7 +2127,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 184(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2144,7 +2144,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 216(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2161,7 +2161,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 248(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] @@ -2193,29 +2193,29 @@ ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} 
ymm7 = ymm7[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] @@ -2288,7 +2288,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm10[1,1,2,2] @@ -2307,7 +2307,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = 
ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[1,1,2,2] @@ -2325,7 +2325,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[1,1,2,2] @@ -2343,7 +2343,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[1,1,2,2] @@ -2362,7 +2362,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 128(%rdx), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] @@ -2380,7 +2380,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = 
ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm4 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] @@ -2398,7 +2398,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 192(%rdx), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm9 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[1,1,2,2] @@ -2416,7 +2416,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 224(%rdx), %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm11 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[1,1,2,2] @@ -2493,7 +2493,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] @@ -2507,7 +2507,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] @@ -2521,7 +2521,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] @@ -2537,7 +2537,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2555,7 +2555,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2572,7 +2572,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 184(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2589,7 +2589,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2606,7 +2606,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] @@ -2638,29 +2638,29 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -41,8 +41,8 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,0,2,5,7,4,6] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3,0,2,5,7,4,6] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX1-ONLY-NEXT: vzeroupper @@ -57,7 +57,7 @@ ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) ; AVX2-SLOW-NEXT: vzeroupper @@ -87,7 +87,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -102,7 +102,7 @@ ; AVX512F-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512F-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX512F-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512F-SLOW-NEXT: vmovaps %ymm0, (%r8) ; AVX512F-SLOW-NEXT: vzeroupper @@ -132,7 +132,7 @@ ; AVX512BW-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX512BW-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512BW-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX512BW-SLOW-NEXT: vpermpd {{.*#+}} ymm0 
= ymm0[0,2,1,3] ; AVX512BW-SLOW-NEXT: vmovaps %ymm0, (%r8) ; AVX512BW-SLOW-NEXT: vzeroupper @@ -198,16 +198,16 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm2[0,1,1,0,4,5,5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,1,1,0,4,5,5,4] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} ymm6 = ymm5[0,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[1,0,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm4[1,0,3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -321,7 +321,7 @@ ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm5[0],xmm7[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = 
xmm0[1],xmm1[1],zero,zero @@ -329,7 +329,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] @@ -337,7 +337,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,0],xmm2[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] @@ -345,7 +345,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) @@ -529,7 +529,7 @@ ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm15[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, 
%ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -539,7 +539,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -549,7 +549,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -562,7 +562,7 @@ ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] @@ -571,7 +571,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} 
xmm4 = xmm4[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] @@ -580,7 +580,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] @@ -590,7 +590,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -598,7 +598,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[3,0],xmm13[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) @@ -987,7 +987,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1004,7 +1004,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1019,7 +1019,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1034,7 +1034,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, 
%ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1049,7 +1049,7 @@ ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm7[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1064,7 +1064,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1079,7 +1079,7 @@ ; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1093,7 +1093,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] @@ -1107,7 +1107,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1120,7 +1120,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1131,7 +1131,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[3,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] @@ -1141,7 +1141,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,0],xmm1[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] @@ -1151,7 +1151,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm1[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] @@ -1161,7 +1161,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm1[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -1171,7 +1171,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm6[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] @@ -1180,7 +1180,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[3,0],xmm4[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) @@ -1941,7 +1941,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm11[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1954,7 +1954,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1966,7 +1966,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps 
{{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1983,7 +1983,7 @@ ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2000,7 +2000,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2017,7 +2017,7 @@ ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2034,7 +2034,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2051,7 +2051,7 @@ ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2068,7 +2068,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2085,7 +2085,7 @@ ; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2102,7 
+2102,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2118,7 +2118,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2133,7 +2133,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2148,7 +2148,7 @@ ; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2162,7 +2162,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2175,7 +2175,7 @@ ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm1[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] @@ -2189,7 +2189,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2202,7 +2202,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2215,7 +2215,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2228,7 +2228,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2241,7 +2241,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2254,7 +2254,7 @@ ; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2267,7 +2267,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2280,7 +2280,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2293,7 +2293,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] 
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2306,7 +2306,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2319,7 +2319,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2330,7 +2330,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[3,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = 
xmm8[2],xmm9[2],xmm8[3],xmm9[3] @@ -2340,7 +2340,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,0],xmm13[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] @@ -2350,7 +2350,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm0[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] @@ -2359,7 +2359,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] @@ -2367,7 +2367,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm10[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = 
xmm3[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r8) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -49,13 +49,13 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,1,4,6,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,1,4,6,6,5] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[u,u,0,2,u,5,7,u] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovlps %xmm1, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) @@ -246,7 +246,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) @@ -410,7 +410,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 ; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6],ymm13[7] @@ -441,9 +441,9 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] @@ -466,29 +466,29 @@ ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm8 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm10, 
%ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3],ymm10[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] @@ -496,17 +496,17 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] @@ -532,7 +532,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm9 ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm10[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm10[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5],ymm5[6,7] @@ -546,10 +546,10 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] ; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] @@ -557,17 +557,17 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] @@ -589,29 +589,29 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3],ymm10[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = 
ymm4[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] @@ -619,17 +619,17 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = 
ymm4[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] @@ -875,7 +875,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6],ymm9[7] @@ -886,7 +886,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm15 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm8[0],xmm7[0],zero,zero @@ -959,16 +959,16 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm6[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm13[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] @@ -1005,11 +1005,11 @@ ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] @@ -1019,37 +1019,37 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm14[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: 
vpermilps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] @@ -1057,15 +1057,15 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm0 @@ 
-1075,26 +1075,26 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] 
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -1132,7 +1132,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm13 ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm15 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] @@ -1141,7 +1141,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm8 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm15[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm13[2],xmm8[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] @@ -1158,17 +1158,17 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm12 ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2,3],ymm6[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: 
vpermilps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1,2],ymm14[3,4],ymm1[5,6],ymm14[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3,4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm10[1,2,3,4],ymm14[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2],ymm4[3,4],ymm14[5,6],ymm4[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] @@ -1185,8 +1185,8 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3,4],ymm0[5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm13 @@ -1196,26 +1196,26 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm5[1,2,3,4],ymm15[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -1251,11 +1251,11 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] @@ -1265,37 +1265,37 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm14[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] @@ -1303,15 +1303,15 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15 ; 
AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm0 @@ -1321,26 +1321,26 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -1921,7 +1921,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm10[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] @@ -1934,7 +1934,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm4[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] @@ -1947,7 +1947,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
xmm6 = xmm6[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm1[1],xmm2[1],zero @@ -1962,7 +1962,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm13 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 @@ -2136,7 +2136,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload @@ -2147,16 +2147,16 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm11[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm9[0,1],ymm7[2],ymm9[3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm14[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] @@ -2215,14 +2215,14 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12 @@ -2231,9 +2231,9 @@ ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = 
xmm9[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] @@ -2241,19 +2241,19 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] @@ -2261,19 +2261,19 @@ ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -2283,12 +2283,12 @@ ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm6 @@ -2298,23 +2298,23 @@ ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vmovaps 96(%r8), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -2322,8 +2322,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2334,20 +2334,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} 
ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2358,20 +2358,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = 
ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2382,20 +2382,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -2403,36 +2403,36 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] ; AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: 
vpermilps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -2497,7 +2497,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm8[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] @@ -2506,7 +2506,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: 
vshufps {{.*#+}} xmm3 = xmm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] @@ -2517,7 +2517,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm10 ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] @@ -2536,7 +2536,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm11 ; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] @@ -2556,7 +2556,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] @@ -2565,15 +2565,15 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm12 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} 
ymm5 = ymm12[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] @@ -2592,20 +2592,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3,4],ymm8[5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm2 ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] @@ -2625,20 +2625,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps 
{{.*#+}} ymm1 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2661,20 +2661,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2],ymm11[3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2,3,4],ymm15[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3,4],ymm11[5,6],ymm15[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -2682,7 +2682,7 @@ ; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 120(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4],ymm12[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] @@ -2692,26 +2692,26 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = 
ymm5[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -2771,14 +2771,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd 
{{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12 @@ -2787,9 +2787,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] @@ -2797,19 +2797,19 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] @@ -2817,19 +2817,19 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -2839,12 +2839,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm6 @@ -2854,23 +2854,23 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -2878,8 +2878,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2890,20 +2890,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2914,20 +2914,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2938,20 +2938,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 
96(%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -2959,36 +2959,36 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} 
ymm0 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -4107,7 +4107,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] @@ -4122,7 +4122,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] @@ -4136,7 +4136,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm13[1],xmm10[1],zero @@ -4151,7 +4151,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm11 @@ -4168,7 +4168,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 132(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 @@ -4186,7 +4186,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), 
%xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 @@ -4204,7 +4204,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 @@ -4222,7 +4222,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 228(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 @@ -4580,7 +4580,7 @@ ; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm14[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] @@ -4633,17 +4633,17 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = 
ymm14[0],ymm12[1,2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm11[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = ymm5[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] @@ -4741,14 +4741,14 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovaps (%rdi), 
%xmm10 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12 @@ -4757,9 +4757,9 @@ ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] @@ -4767,19 +4767,19 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = 
ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] @@ -4787,19 +4787,19 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -4809,12 +4809,12 @@ ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovaps 96(%r8), %ymm6 @@ -4823,9 +4823,9 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -4835,23 +4835,23 @@ ; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 128(%r8), %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vmovaps 128(%r8), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4861,21 +4861,21 @@ ; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = 
xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 160(%r8), %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4887,21 +4887,21 @@ ; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 192(%r8), %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; 
AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4913,21 +4913,21 @@ ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 224(%r8), %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = 
xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4938,7 +4938,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm14 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] @@ -4946,7 +4946,7 @@ ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -4954,8 +4954,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ 
-4966,7 +4966,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -4976,7 +4976,7 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -4984,8 +4984,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -4996,7 +4996,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps 
{{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -5006,7 +5006,7 @@ ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -5014,8 +5014,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5026,7 +5026,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -5036,7 +5036,7 @@ ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -5044,8 +5044,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5056,7 +5056,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -5066,15 +5066,15 @@ ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5085,20 +5085,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %ymm12 ; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5109,20 +5109,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 208(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: 
vpermilps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5133,20 +5133,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = 
ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -5161,7 +5161,7 @@ ; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -5205,7 +5205,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -5216,26 +5216,26 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} 
ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte 
Folded Reload @@ -5340,7 +5340,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm14[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -5349,7 +5349,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -5360,7 +5360,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm6 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -5379,7 +5379,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm12 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm12 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2],xmm12[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] @@ -5398,7 +5398,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] @@ -5417,7 +5417,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] @@ -5436,7 +5436,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] @@ -5455,7 +5455,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} 
xmm15 = xmm0[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] @@ -5475,7 +5475,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm13 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] @@ -5484,7 +5484,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1,2],ymm11[3,4],ymm15[5,6],ymm11[7] @@ -5493,8 +5493,8 @@ ; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm10[1,2,3,4],ymm11[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm13[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] ; AVX2-FAST-NEXT: vpermpd 
{{.*#+}} ymm11 = ymm11[3,3,3,3] @@ -5513,7 +5513,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7] @@ -5523,7 +5523,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1,2],ymm5[3,4],ymm9[5,6],ymm5[7] @@ -5532,8 +5532,8 @@ ; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] @@ -5553,7 +5553,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm1 -; 
AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] @@ -5563,7 +5563,7 @@ ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm7 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3,4],ymm6[5,6],ymm4[7] @@ -5572,8 +5572,8 @@ ; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm5[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4],ymm3[5,6],ymm4[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] @@ -5596,7 +5596,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm4 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -5606,7 +5606,7 @@ ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -5614,8 +5614,8 @@ ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5638,7 +5638,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm13 ; AVX2-FAST-NEXT: vmovaps 128(%rcx), %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] @@ -5648,15 +5648,15 @@ ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5679,22 +5679,22 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovaps 160(%rcx), %ymm12 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5717,20 +5717,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 208(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5753,20 +5753,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1,2,3,4],ymm11[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = 
ymm11[0],ymm8[1,2],ymm11[3,4],ymm8[5,6],ymm11[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] @@ -5824,7 +5824,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload @@ -5835,27 +5835,27 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vpermilps $78, (%rsp), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -5954,14 +5954,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 
64(%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12 @@ -5970,9 +5970,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] @@ -5980,19 +5980,19 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] 
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] @@ -6000,19 +6000,19 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} 
ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -6022,12 +6022,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %ymm6 @@ -6036,9 +6036,9 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -6048,23 +6048,23 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 128(%r8), %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: 
vmovaps 128(%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6074,21 +6074,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 160(%r8), %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] 
; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6100,21 +6100,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 192(%r8), %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6126,21 
+6126,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 224(%r8), %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6151,7 +6151,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} 
ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] @@ -6159,7 +6159,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -6167,8 +6167,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6179,7 +6179,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 
= ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -6189,7 +6189,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -6197,8 +6197,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6209,7 +6209,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -6219,7 +6219,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -6227,8 +6227,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6239,7 +6239,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -6249,7 +6249,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -6257,8 +6257,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6269,7 +6269,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -6279,15 +6279,15 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6298,20 +6298,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6322,20 +6322,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 208(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6346,20 +6346,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 ; 
AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -6374,7 +6374,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -6418,7 +6418,7 @@ ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -6429,26 +6429,26 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} 
ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -56,7 +56,7 @@ ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,5,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm4[1,3] @@ -78,9 +78,9 @@ ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] @@ -127,9 +127,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] @@ -228,24 +228,24 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[0,0],xmm1[0,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm11[1,2],ymm7[5,6],ymm11[5,6] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) @@ -285,7 +285,7 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; 
AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [2,6,3,7,2,6,3,7] ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm1 @@ -414,7 +414,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -423,7 +423,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,2],xmm9[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 @@ -437,30 +437,30 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[1,2],ymm3[1,2],ymm2[5,6],ymm3[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm12 ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm3 @@ -944,7 +944,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,2],xmm8[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -961,7 +961,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[4],ymm13[4],ymm4[5],ymm13[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] @@ -969,7 +969,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,2],xmm3[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 @@ -988,7 +988,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 @@ -999,7 +999,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = 
ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] @@ -1008,20 +1008,20 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = 
ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 @@ -1035,25 +1035,25 @@ ; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm3[0],ymm6[1],ymm3[2,3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm6 @@ -1069,14 +1069,14 @@ ; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax) @@ -2351,7 +2351,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2370,7 +2370,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = 
ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] @@ -2378,7 +2378,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill @@ -2402,7 +2402,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm1 @@ -2416,7 +2416,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2441,7 +2441,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm6, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm1 @@ -2455,7 +2455,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm15[1,2] ; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 @@ -2476,7 +2476,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1 @@ -2490,7 +2490,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,2],ymm10[1,2],ymm0[5,6],ymm10[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] @@ -2503,7 +2503,7 @@ ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,2],ymm9[1,2],ymm10[5,6],ymm9[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] @@ -2514,7 +2514,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,2],ymm14[1,2],ymm6[5,6],ymm14[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] @@ -2522,7 +2522,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm15[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[0,0,0,0] ; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] @@ -2536,7 +2536,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,2],ymm13[1,2],ymm12[5,6],ymm13[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = 
ymm1[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] @@ -2559,26 +2559,26 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} 
ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm2 @@ -2595,26 +2595,26 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm10[3,0],ymm9[7,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm5 @@ -2631,50 +2631,50 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = 
ymm8[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,0],ymm12[3,0],ymm13[7,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) @@ -2930,7 +2930,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -2963,7 +2963,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload @@ -2996,7 +2996,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte 
Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -3031,7 +3031,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -3685,7 +3685,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -3718,7 +3718,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload @@ -3751,7 +3751,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -3786,7 +3786,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -5147,7 +5147,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5165,7 +5165,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} 
ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] @@ -5173,7 +5173,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5199,7 +5199,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm1 @@ -5213,7 +5213,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5239,7 +5239,7 @@ ; AVX1-ONLY-NEXT: vmovaps 
64(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm1 @@ -5253,7 +5253,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5278,7 +5278,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1 @@ -5292,7 +5292,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5317,7 +5317,7 @@ ; AVX1-ONLY-NEXT: vmovaps 
128(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm1 @@ -5331,7 +5331,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 @@ -5355,7 +5355,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm1 @@ -5369,7 +5369,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 @@ -5391,7 +5391,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1 @@ -5405,7 +5405,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5429,7 +5429,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 @@ -5445,7 +5445,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm14 ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] @@ -5460,7 +5460,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] @@ -5475,7 +5475,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] @@ -5503,7 +5503,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] @@ -5517,7 +5517,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = 
ymm11[1,2],mem[1,2],ymm11[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] @@ -5526,9 +5526,9 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm15[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,0,0,0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7] @@ -5545,7 +5545,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] @@ -5556,7 +5556,7 @@ ; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,0,0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] @@ -5570,7 +5570,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm3[1,2],mem[1,2],ymm3[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] @@ -5581,7 +5581,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm5[1,2],ymm1[5,6],ymm5[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 244(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5604,28 +5604,28 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 @@ -5644,29 +5644,29 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = 
xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), 
%xmm1 @@ -5685,53 +5685,53 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm11 @@ -5748,75 +5748,75 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm12[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = 
ymm12[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4,5,6],ymm12[7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = mem[2,3],ymm13[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = 
ymm13[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6],ymm13[7] ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm14 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm3[3,0],ymm0[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = 
ymm14[0,1],ymm3[2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm14 ; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm15 @@ -5833,28 +5833,28 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm15 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, 
%ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1504(%rax) @@ -6380,7 +6380,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -6407,10 +6407,10 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] @@ -6420,7 +6420,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6459,7 +6459,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6497,7 +6497,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6534,7 +6534,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: 
vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6569,7 +6569,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6604,7 +6604,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload @@ -6639,7 +6639,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 ; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload @@ -6664,10 +6664,10 @@ ; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[2,3],ymm14[2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7] @@ -7975,7 +7975,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -8002,10 +8002,10 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = 
ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] @@ -8015,7 +8015,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8054,7 +8054,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8092,7 +8092,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8129,7 +8129,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8164,7 +8164,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8199,7 
+8199,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload @@ -8234,7 +8234,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload @@ -8259,10 +8259,10 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3],ymm14[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = 
ymm15[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -96,9 +96,9 @@ ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> @@ -166,9 +166,9 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] ; 
AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> @@ -393,7 +393,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[0,0],xmm3[0,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] @@ -423,15 +423,15 @@ ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3] ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm9 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6] @@ -474,7 +474,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6 ; 
AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm8 @@ -491,7 +491,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 @@ -506,7 +506,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3] ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) @@ -532,15 +532,15 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3] ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = 
ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6] @@ -792,12 +792,12 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] @@ -820,7 +820,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] @@ -859,7 +859,7 @@ ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm5[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] @@ -868,7 +868,7 @@ ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm14 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm15[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] @@ -878,7 +878,7 @@ ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm9 @@ -888,10 +888,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] ; AVX2-SLOW-NEXT: 
vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] @@ -900,7 +900,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm10 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] @@ -909,12 +909,12 @@ ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = 
ymm6[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm13 @@ -931,7 +931,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] @@ -964,7 +964,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm9, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] @@ -973,7 +973,7 @@ ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm12[1],xmm11[1],zero ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm13 ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm14 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7] @@ -983,20 +983,20 @@ ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = 
ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2],ymm9[3,4,5],ymm7[6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm15 @@ -1010,7 +1010,7 @@ ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = 
xmm4[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5,6,7] @@ -1038,7 +1038,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] @@ -1071,7 +1071,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm5[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] @@ -1080,7 +1080,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] @@ -1090,7 +1090,7 @@ ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm9 @@ -1100,10 +1100,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] @@ -1112,7 +1112,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = 
xmm4[0],xmm5[0],xmm4[1],xmm5[1] @@ -1121,12 +1121,12 @@ ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm13 @@ -1143,7 +1143,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] @@ -1583,7 +1583,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = 
ymm4[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,1,2,2,5,5,6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] @@ -1606,7 +1606,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -1668,7 +1668,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1691,7 +1691,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; 
AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] @@ -1724,7 +1724,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,3],ymm1[3,3],ymm2[7,7],ymm1[7,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3],ymm1[1,2],ymm4[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1737,7 +1737,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm2 @@ -1750,7 +1750,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -1821,7 +1821,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), 
%xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -1835,18 +1835,18 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm12 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm5 @@ -1869,7 +1869,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm14 @@ -1889,7 +1889,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] @@ -1910,11 +1910,11 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm10 @@ -1925,7 +1925,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -1940,11 +1940,11 @@ ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm2 @@ -1955,7 +1955,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] @@ -1967,7 +1967,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; 
AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] @@ -1981,7 +1981,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] @@ -1989,9 +1989,9 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm4 @@ -2010,7 +2010,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm6 ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] @@ -2020,7 +2020,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] @@ -2072,7 +2072,7 @@ ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm8 ; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FAST-NEXT: vmovaps %xmm4, %xmm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 @@ -2088,18 +2088,18 @@ ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 ; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = 
xmm8[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm10[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm2 @@ -2124,7 +2124,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm4 @@ -2169,7 +2169,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm14 @@ -2190,12 +2190,12 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm11[1,1],ymm2[5,5],ymm11[5,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 @@ -2212,7 +2212,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm4 @@ -2233,7 +2233,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] @@ -2242,9 +2242,9 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm1[1,1],ymm3[5,5],ymm1[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm4[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte 
Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm5 @@ -2264,7 +2264,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] @@ -2273,7 +2273,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] @@ -2327,7 +2327,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -2341,18 +2341,18 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm5 @@ -2375,7 +2375,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm14 @@ -2395,7 +2395,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] @@ -2416,11 +2416,11 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm10 @@ -2431,7 +2431,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, 
%xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -2446,11 +2446,11 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm2 @@ -2461,7 +2461,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] @@ -2473,7 +2473,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] @@ -2487,7 +2487,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] @@ -2495,9 +2495,9 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm4 @@ -2516,7 +2516,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] @@ -2526,7 +2526,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] @@ -3445,7 +3445,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -3469,7 +3469,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -3526,7 +3526,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3589,7 +3589,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3649,7 +3649,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; 
AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3675,7 +3675,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] @@ -3691,7 +3691,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] @@ -3706,7 +3706,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} 
ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3785,7 +3785,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3798,7 +3798,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm13 @@ -3817,7 +3817,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[2,3],ymm11[1,2],ymm2[6,7],ymm11[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm1[1,2,3,4],ymm11[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3830,7 +3830,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm10 @@ -3852,7 +3852,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3],ymm4[1,2],ymm2[6,7],ymm4[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload @@ -3973,7 +3973,7 @@ ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm8 ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm9 ; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -3988,18 +3988,18 @@ ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm12 ; AVX2-SLOW-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} xmm2 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm6 @@ -4011,7 +4011,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 64(%rax), %xmm1 @@ -4022,7 +4022,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), 
%xmm13 @@ -4034,7 +4034,7 @@ ; AVX2-SLOW-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm11 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rax), %xmm1 @@ -4045,7 +4045,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm3 @@ -4072,7 +4072,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm2 @@ -4095,7 +4095,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 
48(%rax), %xmm2 @@ -4118,7 +4118,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 80(%rax), %xmm2 @@ -4127,7 +4127,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm9 @@ -4146,7 +4146,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4161,10 +4161,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -4177,7 +4177,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -4191,10 +4191,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -4206,7 +4206,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} 
xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -4220,10 +4220,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -4237,7 +4237,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -4250,23 +4250,23 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = 
ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 108(%r8), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -4296,7 +4296,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -4305,9 +4305,9 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -4315,7 +4315,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -4353,7 +4353,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = 
ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] @@ -4374,7 +4374,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7] @@ -4399,7 +4399,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4495,7 +4495,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm5 ; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -4513,18 +4513,18 @@ ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm13 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 @@ -4537,7 +4537,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 64(%rax), %xmm1 @@ -4548,7 +4548,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] 
; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm2 @@ -4562,7 +4562,7 @@ ; AVX2-FAST-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r9), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] ; AVX2-FAST-NEXT: vmovaps %xmm2, %xmm12 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -4575,7 +4575,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm2 @@ -4600,7 +4600,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm6 ; AVX2-FAST-NEXT: vmovaps (%r9), %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4,5],ymm6[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4625,7 +4625,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm15 @@ -4648,7 +4648,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm15 @@ -4657,7 +4657,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm4 @@ -4672,13 +4672,13 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm14 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vbroadcastss 108(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = 
xmm12[2,2,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] @@ -4730,20 +4730,20 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -4773,7 +4773,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -4821,14 +4821,14 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm1[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm1[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -4868,14 +4868,14 @@ ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm0 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm6[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6],ymm9[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[4],ymm1[4],ymm10[5],ymm1[5] @@ -4900,7 +4900,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6],ymm10[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm10 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] @@ -4925,7 +4925,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm11 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] @@ -5016,7 +5016,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm9 ; AVX2-FAST-PERLANE-NEXT: 
vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -5031,18 +5031,18 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm6 @@ -5054,7 +5054,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: 
vmovaps 64(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rax), %xmm1 @@ -5065,7 +5065,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm13 @@ -5077,7 +5077,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %xmm1 @@ -5088,7 +5088,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm3 @@ -5115,7 +5115,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm2 @@ -5138,7 +5138,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 48(%rax), %xmm2 @@ -5161,7 +5161,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rax), %xmm2 @@ -5170,7 +5170,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3 ; 
AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm9 @@ -5189,7 +5189,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5204,10 +5204,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -5220,7 +5220,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; 
AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -5234,10 +5234,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -5249,7 +5249,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -5263,10 +5263,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -5280,7 +5280,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -5293,23 +5293,23 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 108(%r8), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -5339,7 +5339,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -5348,9 +5348,9 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -5358,7 +5358,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -5396,7 +5396,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), 
%ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] @@ -5417,7 +5417,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7] @@ -5442,7 +5442,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7232,7 +7232,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -7255,7 +7255,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -7312,7 +7312,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill @@ -7376,7 +7376,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7439,7 +7439,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7502,7 +7502,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7566,7 +7566,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7627,7 +7627,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm3 ; AVX1-ONLY-NEXT: 
vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7672,7 +7672,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -7689,7 +7689,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -7705,7 +7705,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] @@ -7722,7 +7722,7 @@ ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] @@ -7739,7 +7739,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -7756,7 +7756,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7] ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 @@ -7773,7 +7773,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm14[2],ymm7[3],ymm14[3],ymm7[6],ymm14[6],ymm7[7],ymm14[7] @@ -7829,7 +7829,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm11[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 @@ -7909,7 +7909,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7944,7 +7944,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7978,7 +7978,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8012,7 +8012,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8047,7 +8047,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8060,7 +8060,7 @@ ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm3 @@ -8081,7 +8081,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8094,7 +8094,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm3 @@ -8313,7 +8313,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -8328,7 +8328,7 @@ ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] @@ -8337,11 +8337,11 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm7 @@ -8353,7 +8353,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 64(%rax), %xmm1 @@ -8364,7 +8364,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2 @@ -8378,7 +8378,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rax), %xmm1 @@ -8389,7 +8389,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm3 @@ -8404,7 +8404,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 128(%rax), %xmm1 @@ -8415,7 +8415,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %xmm1 ; 
AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %xmm3 @@ -8430,7 +8430,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 160(%rax), %xmm1 @@ -8441,7 +8441,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %xmm3 @@ -8456,7 +8456,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 192(%rax), %xmm1 @@ -8467,7 +8467,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %xmm3 @@ -8494,7 +8494,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm2 @@ -8517,7 +8517,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 48(%rax), %xmm2 @@ -8540,7 +8540,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 80(%rax), %xmm2 @@ -8563,7 +8563,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 112(%rax), %xmm2 @@ -8586,7 +8586,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 144(%rax), %xmm2 @@ -8609,7 +8609,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 176(%rax), %xmm2 @@ -8632,7 +8632,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 208(%rax), %xmm2 @@ -8647,7 +8647,7 @@ ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 224(%r8), %ymm15 @@ -8659,14 +8659,14 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] ; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2],xmm13[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm1[1],xmm0[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 228(%r8), %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm13 @@ -8675,20 +8675,20 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm10 @@ -8706,7 +8706,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm13 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -8720,10 +8720,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: 
vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] @@ -8736,7 +8736,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -8750,10 +8750,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3],xmm14[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm15[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8766,7 +8766,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = 
xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8780,10 +8780,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8797,7 +8797,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -8811,10 +8811,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: 
vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8828,7 +8828,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -8842,10 +8842,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 136(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8859,7 +8859,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; 
AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -8873,10 +8873,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 168(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8890,7 +8890,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8904,10 +8904,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} 
xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 200(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8930,13 +8930,13 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -8966,7 +8966,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] ; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -8975,9 +8975,9 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm2 @@ -8985,7 +8985,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -9004,7 +9004,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; 
AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -9081,7 +9081,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -9096,7 +9096,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -9118,7 +9118,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[4],ymm15[4],ymm10[5],ymm15[5] ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -9143,7 +9143,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] @@ -9169,7 +9169,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -9196,7 +9196,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 144(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9223,7 +9223,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm7 ; 
AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -9381,7 +9381,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -9400,7 +9400,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7 ; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] @@ -9409,11 +9409,11 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: 
vpermilps {{.*#+}} xmm1 = xmm7[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 @@ -9426,7 +9426,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 64(%rax), %xmm1 @@ -9437,7 +9437,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3 @@ -9452,7 +9452,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 96(%rax), %xmm1 @@ -9463,7 +9463,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm3 @@ -9478,7 +9478,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 128(%rax), %xmm1 @@ -9489,7 +9489,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm3 @@ -9504,7 +9504,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 160(%rax), %xmm1 @@ -9515,7 +9515,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm3 @@ -9530,7 +9530,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 192(%rax), %xmm1 @@ -9541,7 +9541,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm3 @@ -9567,7 +9567,7 @@ ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%r9), %ymm12 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] @@ -9591,7 +9591,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = 
ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm2 @@ -9614,7 +9614,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm2 @@ -9637,7 +9637,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 112(%rax), %xmm2 @@ -9660,7 +9660,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 144(%rax), %xmm2 @@ -9683,7 +9683,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 176(%rax), %xmm2 @@ -9707,7 +9707,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 208(%rax), %xmm2 @@ -9716,7 +9716,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm3 @@ -9726,7 +9726,7 @@ ; AVX2-FAST-NEXT: vbroadcastss 228(%r8), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm7[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rax), %xmm15 @@ -9755,14 +9755,14 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3 @@ -9789,13 +9789,13 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm4[1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm4 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,2,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] @@ -9846,7 +9846,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, 
%xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9860,7 +9860,7 @@ ; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm14[1,1],mem[1,1],ymm14[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] @@ -9892,7 +9892,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9940,14 +9940,14 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm2 ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -9988,7 +9988,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10036,7 +10036,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 136(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10084,7 +10084,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps 
{{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 168(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10132,7 +10132,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 200(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10142,7 +10142,7 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -10159,7 +10159,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10188,7 +10188,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm0 ; 
AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10217,7 +10217,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm3 @@ -10244,7 +10244,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] @@ -10270,7 +10270,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = 
ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] @@ -10295,7 +10295,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6],ymm9[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10321,7 +10321,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] @@ -10454,7 +10454,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -10469,7 +10469,7 @@ ; 
AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] @@ -10478,11 +10478,11 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm7 @@ -10494,7 +10494,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rax), %xmm1 @@ -10505,7 +10505,7 
@@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2 @@ -10519,7 +10519,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %xmm1 @@ -10530,7 +10530,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm3 @@ -10545,7 +10545,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rax), %xmm1 @@ -10556,7 +10556,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %xmm3 @@ -10571,7 +10571,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rax), %xmm1 @@ -10582,7 +10582,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %xmm3 @@ -10597,7 +10597,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vmovaps 192(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rax), %xmm1 @@ -10608,7 +10608,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %xmm3 @@ -10635,7 +10635,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm2 @@ -10658,7 +10658,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 48(%rax), %xmm2 @@ -10681,7 +10681,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rax), %xmm2 @@ -10704,7 +10704,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 112(%rax), %xmm2 @@ -10727,7 +10727,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 144(%rax), %xmm2 
@@ -10750,7 +10750,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 176(%rax), %xmm2 @@ -10773,7 +10773,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 208(%rax), %xmm2 @@ -10788,7 +10788,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 224(%r8), %ymm15 @@ -10800,14 +10800,14 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm4[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm1[1],xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 228(%r8), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm13 @@ -10816,20 +10816,20 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm10 @@ -10847,7 +10847,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -10861,10 +10861,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), 
%ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] @@ -10877,7 +10877,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -10891,10 +10891,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3],xmm14[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm15[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -10907,7 +10907,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = 
xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -10921,10 +10921,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -10938,7 +10938,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -10952,10 +10952,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -10969,7 +10969,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -10983,10 +10983,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; 
AVX2-FAST-PERLANE-NEXT: vbroadcastsd 136(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -11000,7 +11000,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -11014,10 +11014,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 168(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -11031,7 +11031,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -11045,10 +11045,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 200(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -11071,13 +11071,13 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -11107,7 +11107,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -11116,9 +11116,9 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm2 @@ -11126,7 +11126,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -11145,7 +11145,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -11222,7 +11222,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -11237,7 +11237,7 @@ ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -11259,7 +11259,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[4],ymm15[4],ymm10[5],ymm15[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -11284,7 +11284,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = 
ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] @@ -11310,7 +11310,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -11337,7 +11337,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 144(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11364,7 +11364,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte 
Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -495,7 +495,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -506,7 +506,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -517,7 +517,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm11[0,1,2,3],ymm2[4,5,6,7] @@ -528,7 +528,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] @@ -543,18 +543,18 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm13 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm10[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -565,14 +565,14 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = 
xmm10[0],xmm9[0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -1079,7 +1079,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] @@ -1090,7 +1090,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] @@ -1101,7 +1101,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 @@ -1117,7 +1117,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -1128,7 +1128,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1139,7 +1139,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = 
ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1150,7 +1150,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1161,7 +1161,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1175,12 +1175,12 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm13[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 @@ -1194,14 +1194,14 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 @@ -1211,7 +1211,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 @@ -1222,7 +1222,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} 
xmm12 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] @@ -1230,16 +1230,16 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 @@ -1249,14 +1249,14 @@ ; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: 
vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] @@ -1264,7 +1264,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] @@ -1413,7 +1413,7 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm5 @@ -1425,7 +1425,7 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%r9), %xmm6 ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
xmm2 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -1448,7 +1448,7 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm5 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10 @@ -1458,7 +1458,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm1[0,1,2],xmm13[3] ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 @@ -1470,7 +1470,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm6[4,5,6,7] @@ -1488,11 +1488,11 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = 
xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -1503,13 +1503,13 @@ ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm14[2],mem[2],xmm14[3],mem[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm5[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -1517,7 +1517,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -2335,7 +2335,7 @@ ; 
AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] @@ -2346,7 +2346,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] @@ -2357,7 +2357,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm7[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 @@ -2373,7 +2373,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = 
ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] @@ -2384,7 +2384,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2395,7 +2395,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[4],ymm7[4],ymm2[5],ymm7[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2406,7 +2406,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2423,7 +2423,7 
@@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] @@ -2436,7 +2436,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -2447,7 +2447,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -2458,7 +2458,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 
$1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -2471,7 +2471,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4 @@ -2488,7 +2488,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -2499,7 +2499,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -2510,7 +2510,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -2521,7 +2521,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2533,7 +2533,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 @@ -2545,11 +2545,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -2557,12 +2557,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2571,7 +2571,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2583,7 +2583,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 @@ -2595,11 +2595,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] @@ -2607,12 +2607,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2621,7 +2621,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: 
vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2633,7 +2633,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm15 @@ -2644,23 +2644,23 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7] @@ -2668,7 +2668,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -2679,7 +2679,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm7 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 @@ -2690,23 +2690,23 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} 
xmm13 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] @@ -2714,7 +2714,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3010,13 +3010,13 @@ ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -3024,12 +3024,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -3038,7 +3038,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: 
vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3065,13 +3065,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -3079,12 +3079,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -3093,7 +3093,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3119,25 +3119,25 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} 
xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] @@ -3145,7 +3145,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3170,25 +3170,25 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] @@ -3196,7 +3196,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] @@ -4905,7 +4905,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = 
ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] @@ -4916,7 +4916,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] @@ -4927,7 +4927,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm9[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 @@ -4943,7 +4943,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] @@ -4954,7 +4954,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm2[0],ymm9[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -4965,7 +4965,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[4],ymm9[4],ymm2[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -4976,7 +4976,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -4993,7 +4993,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] @@ -5006,7 +5006,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -5017,7 +5017,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -5028,7 +5028,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -5041,7 +5041,7 @@ ; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4 @@ -5058,7 +5058,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5069,7 +5069,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5080,7 +5080,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5091,7 +5091,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5110,7 +5110,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5121,7 +5121,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5132,7 +5132,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = 
ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5143,7 +5143,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5162,7 +5162,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5173,7 +5173,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5184,7 +5184,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5195,7 +5195,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5214,7 +5214,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5225,7 +5225,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = 
ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5236,7 +5236,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5247,7 +5247,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5266,7 +5266,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5277,7 +5277,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = 
ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5288,7 +5288,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5299,7 +5299,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5311,7 +5311,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 @@ -5323,11 +5323,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -5335,12 +5335,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] @@ -5349,7 +5349,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: 
vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] @@ -5361,7 +5361,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 @@ -5373,11 +5373,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -5385,12 +5385,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -5399,7 +5399,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5411,7 +5411,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm6 @@ -5423,11 +5423,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = 
xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -5435,12 +5435,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -5449,7 +5449,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5461,7 +5461,7 @@ ; AVX1-ONLY-NEXT: 
vmovaps 96(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 @@ -5473,11 +5473,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -5485,12 +5485,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -5499,7 +5499,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5511,7 +5511,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 128(%r10), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm6 @@ -5523,11 +5523,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm0[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] @@ -5535,12 +5535,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -5549,7 +5549,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5561,7 +5561,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 160(%r10), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 @@ -5573,11 +5573,11 @@ ; 
AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] @@ -5585,12 +5585,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] @@ -5599,7 +5599,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps 
{{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -5611,7 +5611,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 192(%r10), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm15 @@ -5622,23 +5622,23 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = 
xmm8[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7] @@ -5646,7 +5646,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5657,7 +5657,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 224(%r10), %xmm7 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm14 @@ -5668,23 +5668,23 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = 
xmm13[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] @@ -5692,7 +5692,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6256,13 +6256,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: 
vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6270,12 +6270,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6284,7 +6284,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} 
xmm2 = xmm4[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6311,13 +6311,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6325,12 +6325,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = 
xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6339,7 +6339,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6366,13 +6366,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6380,12 +6380,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] 
-; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6394,7 +6394,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6421,13 +6421,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = 
xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6435,12 +6435,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6449,7 +6449,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6476,13 +6476,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 
= xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6490,12 +6490,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6504,7 +6504,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6531,13 +6531,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6545,12 +6545,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6559,7 +6559,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6585,25 +6585,25 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] ; 
AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] @@ -6611,7 +6611,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6636,25 +6636,25 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = 
ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] @@ -6662,7 +6662,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -139,7 +139,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermilps 
{{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] @@ -270,7 +270,7 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] @@ -278,7 +278,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] @@ -547,11 +547,11 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] +; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] @@ -563,7 +563,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] @@ -575,7 +575,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 @@ -1087,7 +1087,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1102,7 +1102,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill @@ -1117,7 +1117,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1131,7 +1131,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] @@ -1143,7 +1143,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = 
ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm10 @@ -1158,7 +1158,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 @@ -1173,7 +1173,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 192(%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 @@ -1188,7 +1188,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 224(%rdx), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm7 @@ -2224,7 +2224,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm15 ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2239,7 +2239,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm9[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2254,7 +2254,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2269,7 +2269,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: 
vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2284,7 +2284,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 @@ -2302,7 +2302,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 @@ -2320,7 +2320,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 @@ -2338,7 +2338,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 @@ -2356,7 +2356,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm1 @@ -2374,7 +2374,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm1 @@ -2392,7 +2392,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; 
AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm1 @@ -2409,7 +2409,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm1 @@ -2424,7 +2424,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 384(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 @@ -2439,7 +2439,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 416(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm1 @@ -2454,7 +2454,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 448(%rdx), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = 
ymm14[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm1[4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] ; AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 @@ -2469,7 +2469,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 480(%rdx), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm14 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -803,7 +803,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],mem[0],ymm8[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1737,7 +1737,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] @@ -1749,13 +1749,13 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 @@ -3719,7 +3719,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] @@ -3733,14 +3733,14 @@ ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 @@ -3763,7 +3763,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 @@ -3772,7 +3772,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %ymm0 @@ -3788,7 +3788,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = 
xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %ymm0 @@ -3810,7 +3810,7 @@ ; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm15 ; AVX1-ONLY-NEXT: vmovapd 480(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -570,12 +570,12 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5,6,7] @@ -2030,7 +2030,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 ; AVX1-ONLY-NEXT: 
vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] @@ -2041,7 +2041,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] @@ -2079,7 +2079,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8 @@ -2096,7 +2096,7 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -2113,7 +2113,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = 
ymm4[1],ymm0[1],ymm4[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 @@ -4756,7 +4756,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 @@ -4778,7 +4778,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] @@ -4824,7 +4824,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4841,7 +4841,7 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] @@ -4868,7 +4868,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4916,7 +4916,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4933,7 +4933,7 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] @@ -4960,7 +4960,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 144(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5008,7 +5008,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5025,7 +5025,7 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] @@ -10400,7 +10400,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = 
xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] @@ -10412,7 +10412,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] @@ -10456,7 +10456,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 @@ -10476,7 +10476,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10496,7 +10496,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 @@ -10538,7 +10538,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm1 @@ -10558,7 +10558,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10578,7 +10578,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 @@ -10620,7 +10620,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; 
AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm1 @@ -10640,7 +10640,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10660,7 +10660,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 @@ -10702,7 +10702,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%r9), %xmm1 @@ -10722,7 +10722,7 @@ ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10742,7 +10742,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 272(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 @@ -10784,7 +10784,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 304(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm1 @@ -10804,7 +10804,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10824,7 +10824,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 336(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 @@ -10863,7 +10863,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 368(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%r9), %xmm13 @@ -10881,7 +10881,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10898,7 +10898,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm3 @@ -10925,7 +10925,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm5 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -1181,8 +1181,8 @@ ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm9[0,0,1,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[0,0,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -1598,7 +1598,7 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 
$1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [281474976710655,281474976710655,281474976710655,281474976710655] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 @@ -1607,7 +1607,7 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 @@ -1639,9 +1639,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] @@ -1653,7 +1653,7 @@ ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 @@ -1674,9 +1674,9 @@ ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] @@ -1684,7 +1684,7 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] @@ -1707,9 +1707,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 ; 
AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 @@ -1726,7 +1726,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm13, %ymm1 @@ -1741,9 +1741,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] @@ -1756,7 +1756,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 @@ -1767,9 +1767,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1791,9 +1791,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero @@ -1817,9 +1817,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -4157,7 +4157,7 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] ; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [281474976710655,281474976710655,281474976710655,281474976710655] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 @@ -4168,7 +4168,7 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 @@ -4189,7 +4189,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855] ; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 @@ -4204,9 +4204,9 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; 
AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero @@ -4235,9 +4235,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm8 @@ -4266,9 +4266,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] @@ -4279,7 +4279,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, 
%ymm2 @@ -4295,9 +4295,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] @@ -4322,9 +4322,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5] @@ -4335,7 +4335,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 @@ -4355,9 +4355,9 @@ ; AVX1-ONLY-NEXT: vpshuflw 
{{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7 @@ -4386,9 +4386,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] @@ -4399,7 +4399,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 @@ -4415,9 +4415,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] @@ -4442,9 +4442,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] @@ -4455,7 +4455,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -4475,9 +4475,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = 
ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 @@ -4505,9 +4505,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] @@ -4518,7 +4518,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 @@ -4533,9 +4533,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = 
ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] @@ -4559,9 +4559,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] @@ -4572,7 +4572,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 @@ -4589,9 +4589,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -4619,9 +4619,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,4,6,5] @@ -4632,7 +4632,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm4, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -446,7 +446,7 @@ ; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: 
vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -102,7 +102,7 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax @@ -165,7 +165,7 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax @@ -340,9 +340,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax @@ -411,9 +411,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax @@ -638,9 +638,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -718,9 +718,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1002,9 +1002,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1091,9 +1091,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 diff 
--git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -86,7 +86,7 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -140,7 +140,7 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -287,9 +287,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -349,9 +349,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -554,9 +554,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, 
%xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -629,9 +629,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -891,9 +891,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -975,9 +975,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll 
b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll @@ -76,7 +76,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; @@ -87,7 +87,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1) @@ -144,7 +144,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -152,7 +152,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -164,7 +164,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -172,7 +172,7 @@ ; AVX512-NEXT: vaddss %xmm2, 
%xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -264,7 +264,7 @@ ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -272,14 +272,14 @@ ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -287,7 +287,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -299,7 +299,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] 
; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -307,7 +307,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -315,7 +315,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -323,7 +323,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -405,7 +405,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: retq ; @@ -414,7 +414,7 @@ ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: retq ; @@ -424,7 +424,7 @@ ; 
AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -434,7 +434,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a0) @@ -488,7 +488,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -496,7 +496,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -506,7 +506,7 @@ ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -514,7 +514,7 @@ ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: 
vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -525,7 +525,7 @@ ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -533,7 +533,7 @@ ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -544,7 +544,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -552,7 +552,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -641,7 +641,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: 
vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm2 @@ -649,14 +649,14 @@ ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -664,7 +664,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -674,7 +674,7 @@ ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm2 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm2 @@ -682,14 +682,14 @@ ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -697,7 +697,7 @@ ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -708,7 +708,7 @@ ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm2 @@ -716,14 +716,14 @@ ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = 
xmm1[3,3,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -731,7 +731,7 @@ ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -742,7 +742,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -750,7 +750,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -758,7 +758,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -766,7 +766,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; 
AVX512-NEXT: retq @@ -836,7 +836,7 @@ ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -846,7 +846,7 @@ ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0) @@ -900,7 +900,7 @@ ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -908,7 +908,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -919,7 +919,7 @@ ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -927,7 +927,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; 
AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1016,7 +1016,7 @@ ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2 @@ -1024,14 +1024,14 @@ ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -1039,7 +1039,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1050,7 +1050,7 @@ ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; 
AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -1058,7 +1058,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -1066,7 +1066,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -1074,7 +1074,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -197,7 +197,7 @@ ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm4 @@ -213,7 +213,7 @@ ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} 
xmm1 = xmm0[3,3,3,3] ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm0, %xmm3, %xmm4 @@ -302,10 +302,10 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm8 @@ -335,10 +335,10 @@ ; AVX512BW-LABEL: test_v8f32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm8 @@ -368,10 +368,10 @@ ; AVX512VL-LABEL: test_v8f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm8 @@ -506,7 +506,7 @@ ; AVX-NEXT: vmaxss %xmm1, %xmm3, 
%xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 @@ -522,7 +522,7 @@ ; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper @@ -539,7 +539,7 @@ ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} @@ -555,7 +555,7 @@ ; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} @@ -571,7 +571,7 @@ ; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} ; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} @@ -586,7 +586,7 @@ ; AVX512BW-NEXT: 
vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm0 ; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} @@ -596,18 +596,18 @@ ; AVX512VL-LABEL: test_v16f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm16 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -130,7 +130,7 @@ ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 
= xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm0, %xmm3, %xmm4 @@ -146,7 +146,7 @@ ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm0, %xmm3, %xmm4 @@ -235,10 +235,10 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm0, %xmm7, %xmm8 @@ -268,10 +268,10 @@ ; AVX512BW-LABEL: test_v8f32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm8 @@ -301,10 +301,10 @@ ; AVX512VL-LABEL: test_v8f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = 
xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm8 @@ -439,7 +439,7 @@ ; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 @@ -455,7 +455,7 @@ ; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper @@ -472,7 +472,7 @@ ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} @@ -488,7 +488,7 @@ ; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} @@ -504,7 +504,7 @@ ; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} ; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512BW-NEXT: 
vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} @@ -519,7 +519,7 @@ ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm0 ; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} @@ -529,18 +529,18 @@ ; AVX512VL-LABEL: test_v16f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] ; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm10 = xmm9[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm16 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll @@ -75,7 +75,7 @@ ; AVX-NEXT: 
vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; @@ -86,7 +86,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1) @@ -143,7 +143,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -151,7 +151,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -163,7 +163,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -171,7 +171,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: 
vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -263,7 +263,7 @@ ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -271,14 +271,14 @@ ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -286,7 +286,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -298,7 +298,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -306,7 +306,7 @@ ; AVX512-NEXT: vmulss %xmm3, 
%xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -314,7 +314,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -322,7 +322,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -393,7 +393,7 @@ ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -403,7 +403,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) @@ -457,7 +457,7 @@ ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = 
xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -465,7 +465,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -476,7 +476,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -484,7 +484,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -573,7 +573,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2 @@ -581,14 +581,14 @@ ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = 
xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -596,7 +596,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -607,7 +607,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -615,7 +615,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -623,7 +623,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -631,7 +631,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; 
AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -701,7 +701,7 @@ ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -711,7 +711,7 @@ ; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0) @@ -765,7 +765,7 @@ ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -773,7 +773,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -784,7 +784,7 @@ ; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} 
xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -792,7 +792,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -881,7 +881,7 @@ ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2 @@ -889,14 +889,14 @@ ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -904,7 +904,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -915,7 +915,7 @@ ; AVX512-NEXT: 
vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -923,7 +923,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -931,7 +931,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -939,7 +939,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -86,7 +86,7 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, 
%xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -140,7 +140,7 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -287,9 +287,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -349,9 +349,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -554,9 +554,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -629,9 +629,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: 
vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -891,9 +891,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -975,9 +975,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -170,7 +170,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ 
-182,7 +182,7 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -360,7 +360,7 @@ ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -374,7 +374,7 @@ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -681,7 +681,7 @@ ; AVX1-NEXT: vblendvpd %xmm1, %xmm5, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -699,7 +699,7 @@ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll 
b/llvm/test/CodeGen/X86/vector-reduce-smin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -170,7 +170,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -182,7 +182,7 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -360,7 +360,7 @@ ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -374,7 +374,7 @@ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -681,7 +681,7 @@ ; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm5, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -699,7 +699,7 @@ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -185,7 +185,7 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -202,7 +202,7 @@ ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -400,7 +400,7 @@ ; AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -421,7 +421,7 @@ ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -769,7 +769,7 @@ ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -798,7 +798,7 @@ ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -186,7 +186,7 @@ ; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -203,7 +203,7 @@ ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -403,7 +403,7 @@ ; AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm1, 
%xmm4, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -424,7 +424,7 @@ ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -773,7 +773,7 @@ ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -802,7 +802,7 @@ ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -86,7 +86,7 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -140,7 +140,7 @@ ; 
AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -287,9 +287,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -349,9 +349,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -554,9 +554,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -629,9 +629,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -891,9 +891,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -975,9 +975,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -2681,7 +2681,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = 
[9241421688590303745,9241421688590303745] diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -1526,8 +1526,8 @@ ; XOPAVX1-LABEL: vector_variable_shift_right: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOPAVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1712,7 +1712,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shift32_v4i64: @@ -1724,7 +1724,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shift32_v4i64: @@ -1746,7 +1746,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: shift32_v4i64: diff 
--git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -1600,7 +1600,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shift32_v4i64: @@ -1612,7 +1612,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shift32_v4i64: @@ -1634,7 +1634,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: shift32_v4i64: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -15,7 +15,7 @@ ; ; AVX1-LABEL: shuffle_v2i64_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v2i64_00: @@ -38,7 +38,7 @@ ; ; AVX-LABEL: shuffle_v2i64_10: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -51,7 +51,7 @@ ; ; AVX-LABEL: 
shuffle_v2i64_11: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -64,7 +64,7 @@ ; ; AVX1-LABEL: shuffle_v2i64_22: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v2i64_22: @@ -87,7 +87,7 @@ ; ; AVX-LABEL: shuffle_v2i64_32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,0,1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -100,7 +100,7 @@ ; ; AVX-LABEL: shuffle_v2i64_33: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -18,7 +18,7 @@ ; ; AVX-LABEL: shuffle_v4i32_0001: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -31,7 +31,7 @@ ; ; AVX-LABEL: shuffle_v4i32_0020: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -44,7 +44,7 @@ ; ; AVX-LABEL: shuffle_v4i32_0112: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2] ; AVX-NEXT: retq %shuffle = shufflevector 
<4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -57,7 +57,7 @@ ; ; AVX-LABEL: shuffle_v4i32_0300: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -70,7 +70,7 @@ ; ; AVX-LABEL: shuffle_v4i32_1000: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -83,7 +83,7 @@ ; ; AVX-LABEL: shuffle_v4i32_2200: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -96,7 +96,7 @@ ; ; AVX-LABEL: shuffle_v4i32_3330: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -109,7 +109,7 @@ ; ; AVX-LABEL: shuffle_v4i32_3210: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -123,7 +123,7 @@ ; ; AVX-LABEL: shuffle_v4i32_2121: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,2,1] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -137,7 +137,7 @@ ; ; AVX-LABEL: shuffle_v4f32_0001: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> 
%shuffle @@ -150,7 +150,7 @@ ; ; AVX-LABEL: shuffle_v4f32_0020: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -163,7 +163,7 @@ ; ; AVX-LABEL: shuffle_v4f32_0300: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -176,7 +176,7 @@ ; ; AVX-LABEL: shuffle_v4f32_1000: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -189,7 +189,7 @@ ; ; AVX-LABEL: shuffle_v4f32_2200: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -202,7 +202,7 @@ ; ; AVX-LABEL: shuffle_v4f32_3330: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -215,7 +215,7 @@ ; ; AVX-LABEL: shuffle_v4f32_3210: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -228,7 +228,7 @@ ; ; AVX-LABEL: shuffle_v4f32_0011: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ 
-241,7 +241,7 @@ ; ; AVX-LABEL: shuffle_v4f32_2233: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -358,7 +358,7 @@ ; ; AVX1-LABEL: shuffle_v4i32_0124: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1-NEXT: retq ; @@ -404,15 +404,15 @@ ; ; AVX1-LABEL: shuffle_v4i32_0142: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i32_0142: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-NEXT: retq ; @@ -455,15 +455,15 @@ ; ; AVX1-LABEL: shuffle_v4i32_0412: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i32_0412: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: retq ; @@ -505,7 +505,7 @@ ; ; AVX1OR2-LABEL: shuffle_v4i32_4012: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2] +; AVX1OR2-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[0,0,1,2] ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; AVX1OR2-NEXT: retq ; @@ -540,7 +540,7 @@ ; AVX1OR2-LABEL: shuffle_v4i32_0451: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX1OR2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_0451: @@ -575,7 +575,7 @@ ; AVX1OR2-LABEL: shuffle_v4i32_4015: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX1OR2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_4015: @@ -1203,14 +1203,14 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v4i32_z4zz: @@ -1259,14 +1259,14 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v4i32_zz4z: 
@@ -1327,14 +1327,14 @@ ; ; AVX1-LABEL: shuffle_v4i32_z6zz: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX2-SLOW-NEXT: retq @@ -2409,7 +2409,7 @@ ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovaps %xmm1, (%rsi) ; AVX1-NEXT: retq ; @@ -2517,7 +2517,7 @@ ; AVX1OR2-LABEL: shuffle_mem_v4f32_0624: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2] -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1] ; AVX1OR2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_mem_v4f32_0624: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -20,7 +20,7 @@ ; ; AVX-LABEL: shuffle_v8i16_01012323: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -33,7 +33,7 @@ ; ; AVX-LABEL: shuffle_v8i16_67452301: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX-NEXT: retq %shuffle = shufflevector <8 x 
i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -240,7 +240,7 @@ ; ; AVX-LABEL: shuffle_v8i16_23016745: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2] ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -1382,7 +1382,7 @@ ; ; AVX-LABEL: shuffle_v8i16_XXXdXXXX: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,2,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,2,3,3] ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -691,7 +691,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -722,7 +722,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -741,7 +741,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: 
shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -772,7 +772,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -1542,7 +1542,7 @@ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: @@ -1564,7 +1564,7 @@ ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: @@ -1728,7 +1728,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: @@ -1759,7 +1759,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} 
xmm0 = xmm0[0,0,1,0,4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: @@ -1778,7 +1778,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: @@ -1809,7 +1809,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: @@ -1828,7 +1828,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: @@ -1859,7 +1859,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: @@ -2990,7 +2990,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps 
%xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: @@ -3002,7 +3002,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: @@ -3363,7 +3363,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: @@ -3384,7 +3384,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: @@ -3638,7 +3638,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: 
shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: @@ -3747,7 +3747,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: @@ -3768,7 +3768,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] ; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: @@ -3789,7 +3789,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: @@ -5788,7 +5788,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; ALL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6] ; ALL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -6979,7 +6979,7 @@ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: @@ -7001,7 +7001,7 @@ ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: @@ -7171,7 +7171,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: @@ -7202,7 +7202,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2436,7 +2436,7 @@ ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -794,7 +794,7 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_0000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -814,7 +814,7 @@ define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_0001: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1244,13 +1244,13 @@ ; AVX2-LABEL: shuffle_v4i64_1054: ; AVX2: # %bb.0: ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1054: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1054: @@ -1262,7 +1262,7 @@ ; 
AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_1054: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -1312,7 +1312,7 @@ ; AVX2-LABEL: shuffle_v4i64_3276: ; AVX2: # %bb.0: ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_3276: @@ -1346,13 +1346,13 @@ ; AVX2-LABEL: shuffle_v4i64_1076: ; AVX2: # %bb.0: ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1076: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_1076: @@ -1507,7 +1507,7 @@ define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) { ; ALL-LABEL: shuffle_v4i64_11uu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; ALL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -1517,7 +1517,7 @@ ; AVX1-LABEL: shuffle_v4i64_22uu: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: 
shuffle_v4i64_22uu: @@ -1585,14 +1585,14 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0044_v2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX512VL-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> @@ -1605,7 +1605,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; ALL-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> @@ -1767,7 +1767,7 @@ define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) { ; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -10,7 +10,7 @@ define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00000000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, 
%ymm0 ; AVX1-NEXT: retq ; @@ -25,14 +25,14 @@ define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00000010: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00000010: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -44,13 +44,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00000010: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000010: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -62,7 +62,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00000010: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -72,14 +72,14 @@ define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00000200: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[0,2,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00000200: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; @@ -91,13 +91,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00000200: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000200: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; @@ -109,7 +109,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00000200: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -119,14 +119,14 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00003000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00003000: ; AVX2-SLOW: # %bb.0: -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; @@ -138,13 +138,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00003000: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00003000: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; @@ -156,7 +156,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00003000: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -244,8 +244,8 @@ define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00112233: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -261,14 +261,14 @@ define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00001111: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00001111: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: retq ; @@ -280,13 +280,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00001111: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00001111: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -298,7 +298,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00001111: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -318,7 +318,7 @@ ; AVX1-LABEL: shuffle_v8f32_08080808: ; AVX1: # %bb.0: ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -353,13 +353,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_08084c4c: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[0,2,1,3,4,6,5,7] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_08084c4c: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_08084c4c: @@ -439,7 +439,7 @@ ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; @@ -447,7 +447,7 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq @@ -465,7 +465,7 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -483,8 +483,8 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_091b2d3f: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX1-NEXT: vshufps 
{{.*#+}} xmm0 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: retq @@ -526,7 +526,7 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_09ab1def: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq @@ -540,7 +540,7 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_09ab1def: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -558,7 +558,7 @@ define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_00014445: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -567,7 +567,7 @@ define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_00204464: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -576,7 +576,7 @@ define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_03004744: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[0,3,0,0,4,7,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -585,7 +585,7 @@ define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_10005444: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -594,7 +594,7 @@ define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_22006644: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -603,7 +603,7 @@ define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_33307774: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -612,7 +612,7 @@ define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_32107654: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -621,7 +621,7 @@ define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_00234467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x 
i32> ret <8 x float> %shuffle @@ -654,7 +654,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; ALL-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> ret <8 x float> %1 @@ -663,7 +663,7 @@ define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_10325476: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -694,7 +694,7 @@ define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_10235467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -703,7 +703,7 @@ define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_10225466: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -896,7 +896,7 @@ ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: retq @@ -916,7 
+916,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -935,18 +935,18 @@ ; AVX1-LABEL: shuffle_v8f32_f511235a: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5] ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[3],ymm0[3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_f511235a: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: retq @@ -962,9 +962,9 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_f511235a: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] -; AVX2-FAST-PERLANE-NEXT: 
vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -981,13 +981,13 @@ define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_32103210: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_32103210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -1000,13 +1000,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_32103210: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_32103210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -1019,7 +1019,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_32103210: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1030,12 +1030,12 @@ ; AVX1-LABEL: shuffle_v8f32_76547654: 
; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_76547654: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: retq ; @@ -1048,13 +1048,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_76547654: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-SLOW-NEXT: retq ; @@ -1067,7 +1067,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_76547654: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1078,12 +1078,12 @@ ; AVX1-LABEL: shuffle_v8f32_76543210: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_76543210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -1095,13 +1095,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_76543210: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -1113,7 +1113,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_76543210: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1124,13 +1124,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_3210ba98: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210ba98: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_3210ba98: @@ -1142,7 +1142,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_3210ba98: ; AVX512VL-FAST-PERLANE: # %bb.0: ; 
AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1152,13 +1152,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_3210fedc: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210fedc: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc: @@ -1174,13 +1174,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_7654fedc: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_7654fedc: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_7654fedc: @@ -1192,7 +1192,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_7654fedc: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1202,13 +1202,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_fedc7654: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_fedc7654: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_fedc7654: @@ -1221,7 +1221,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_fedc7654: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1266,13 +1266,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_ba987654: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba987654: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654: @@ -1289,13 +1289,13 @@ ; AVX1OR2-LABEL: 
shuffle_v8f32_ba983210: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba983210: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_ba983210: @@ -1308,7 +1308,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_ba983210: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1336,12 +1336,12 @@ ; AVX1-LABEL: shuffle_v8f32_084c195d: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,0,3,5,5,4,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,0,3,5,5,4,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: retq @@ 
-1474,7 +1474,7 @@ define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_uuuu1111: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1485,7 +1485,7 @@ ; AVX1-LABEL: shuffle_v8f32_44444444: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_44444444: @@ -1539,7 +1539,7 @@ define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_uuuu3210: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1559,7 +1559,7 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_1111uuuu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1569,7 +1569,7 @@ ; ALL-LABEL: shuffle_v8f32_5555uuuu: ; ALL: # %bb.0: ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1580,7 +1580,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: 
vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; ALL-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> @@ -1614,7 +1614,7 @@ define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00000000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1629,14 +1629,14 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00000010: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00000010: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -1648,13 +1648,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00000010: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000010: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -1666,7 +1666,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00000010: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = 
xmm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1676,14 +1676,14 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00000200: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00000200: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; @@ -1695,13 +1695,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00000200: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000200: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; @@ -1713,7 +1713,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00000200: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1723,14 
+1723,14 @@ define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00003000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00003000: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; @@ -1742,13 +1742,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00003000: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00003000: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; @@ -1760,7 +1760,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00003000: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1844,7 +1844,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_01014545: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2OR512VL-NEXT: retq %shuffle = 
shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1853,15 +1853,15 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00112233: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00112233: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_00112233: @@ -1873,13 +1873,13 @@ ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00112233: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00112233: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00112233: @@ -1891,7 +1891,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00112233: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ 
-1900,14 +1900,14 @@ define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00001111: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00001111: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: retq ; @@ -1919,13 +1919,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00001111: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00001111: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -1937,7 +1937,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00001111: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1957,7 +1957,7 @@ ; AVX1-LABEL: shuffle_v8i32_08080808: ; AVX1: # %bb.0: ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-NEXT: vinsertf128 $1, 
%xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1992,19 +1992,19 @@ ; AVX1-LABEL: shuffle_v8i32_08084c4c: ; AVX1: # %bb.0: ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8i32_08084c4c: ; AVX2: # %bb.0: ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_08084c4c: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_08084c4c: @@ -2084,7 +2084,7 @@ ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; @@ -2164,7 +2164,7 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_09ab1def: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-NEXT: retq @@ -2203,7 +2203,7 @@ define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_00014445: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm0[0,0,0,1,4,4,4,5] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2212,7 +2212,7 @@ define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_00204464: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2221,7 +2221,7 @@ define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_03004744: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2230,7 +2230,7 @@ define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_10005444: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2239,7 +2239,7 @@ define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_22006644: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2248,7 +2248,7 @@ define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_33307774: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 
x i32> ret <8 x i32> %shuffle @@ -2257,7 +2257,7 @@ define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_32107654: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2266,7 +2266,7 @@ define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_00234467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2280,7 +2280,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00224466: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2289,7 +2289,7 @@ define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_10325476: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2303,7 +2303,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_11335577: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2312,7 +2312,7 @@ define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_10235467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 
= ymm0[1,0,2,3,5,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2321,7 +2321,7 @@ define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_10225466: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2625,7 +2625,7 @@ ; AVX2-SLOW-LABEL: shuffle_v8i32_6caa87e5: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: retq @@ -2641,7 +2641,7 @@ ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_6caa87e5: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -2659,13 +2659,13 @@ define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_32103210: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_32103210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[3,2,1,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -2678,13 +2678,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_32103210: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_32103210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -2697,7 +2697,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_32103210: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2708,12 +2708,12 @@ ; AVX1-LABEL: shuffle_v8i32_76547654: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_76547654: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: retq ; @@ -2726,13 +2726,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_76547654: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-SLOW-NEXT: retq ; @@ -2745,7 +2745,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_76547654: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2756,12 +2756,12 @@ ; AVX1-LABEL: shuffle_v8i32_76543210: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_76543210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -2773,13 +2773,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_76543210: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ 
-2791,7 +2791,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_76543210: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2802,13 +2802,13 @@ ; AVX1OR2-LABEL: shuffle_v8i32_3210ba98: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210ba98: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_3210ba98: @@ -2820,7 +2820,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_3210ba98: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2830,13 +2830,13 @@ ; AVX1OR2-LABEL: shuffle_v8i32_3210fedc: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210fedc: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; 
AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_3210fedc: @@ -2852,7 +2852,7 @@ ; AVX1OR2-LABEL: shuffle_v8i32_7654fedc: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_7654fedc: @@ -2880,7 +2880,7 @@ ; AVX1OR2-LABEL: shuffle_v8i32_fedc7654: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_fedc7654: @@ -2909,13 +2909,13 @@ ; AVX1OR2-LABEL: shuffle_v8i32_ba987654: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba987654: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba987654: @@ -2932,13 +2932,13 @@ ; AVX1OR2-LABEL: shuffle_v8i32_ba983210: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba983210: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba983210: @@ -3105,7 +3105,7 @@ define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_uuuu1111: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -3115,7 +3115,7 @@ define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_2222uuuu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,2,2] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3134,7 +3134,7 @@ ; AVX1-LABEL: shuffle_v8i32_44444444: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_44444444: @@ -3180,7 +3180,7 @@ ; AVX1-LABEL: shuffle_v8i32_44444444_bc: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_44444444_bc: @@ -3228,7 +3228,7 @@ ; ALL-LABEL: shuffle_v8i32_5555uuuu: ; ALL: # %bb.0: ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3239,7 +3239,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: 
vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] @@ -3273,7 +3273,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_uuuuuu7u: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3284,7 +3284,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; ALL-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> @@ -3297,7 +3297,7 @@ ; ALL: # %bb.0: ; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; ALL-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> ret <8 x i32> %1 @@ -3317,7 +3317,7 @@ define <8 x float> @splat_v8f32(<4 x float> %r) { ; AVX1-LABEL: splat_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -3338,7 +3338,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[2,0,3,1,6,4,7,5] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v8i32_z0U2zUz6: @@ -3354,7 +3354,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v8i32_1U3z5zUU: @@ -3428,7 +3428,7 @@ define <8 x i32> @shuffle_v8i32_30127456(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_30127456: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3437,7 +3437,7 @@ define <8 x i32> @shuffle_v8i32_12305674(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_12305674: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3690,7 +3690,7 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) { ; AVX1-LABEL: broadcast_concat_crash: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -3699,7 +3699,7 @@ ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -3708,7 +3708,7 @@ ; 
AVX512VL-SLOW: # %bb.0: # %entry ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,3] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VL-SLOW-NEXT: retq @@ -3812,7 +3812,7 @@ ; AVX1-LABEL: lowhalf_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,2,2] ; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: retq ; @@ -3839,7 +3839,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,2] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: lowhalf_v8f32: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -141,7 +141,7 @@ define <16 x float> @shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_00_00_02_00_04_04_06_04_08_08_10_08_12_12_14_12: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,0,2,0,4,4,6,4,8,8,10,8,12,12,14,12] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -150,7 +150,7 @@ define <16 x float> @shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12(<16 x float> %a, <16 x float> %b) { ; ALL-LABEL: shuffle_v16f32_03_uu_uu_uu_uu_04_uu_uu_uu_uu_11_uu_uu_uu_uu_12: ; ALL: # %bb.0: -; 
ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,0,3,0,7,4,7,4,11,8,11,8,15,12,15,12] ; ALL-NEXT: retq %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> ret <16 x float> %shuffle @@ -284,7 +284,7 @@ define <16 x float> @shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04(<16 x float> %a) { ; ALL-LABEL: shuffle_v16f32_0b_0a_09_08_0f_0e_0d_0c_03_02_01_00_07_06_05_04: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; ALL-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,0,1,2,3] ; ALL-NEXT: retq %1 = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> @@ -411,7 +411,7 @@ define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %c @@ -420,7 +420,7 @@ define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %c diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -1300,7 +1300,7 @@ define <8 x i64> 
@shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00224466: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1309,7 +1309,7 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10325476: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1318,7 +1318,7 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_11335577: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -16,7 +16,7 @@ ; ; KNL-LABEL: expand: ; KNL: # %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7] ; KNL-NEXT: ret{{[l|q]}} @@ -291,7 +291,7 @@ ; ; KNL-LABEL: expand15: ; KNL: # %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] ; KNL-NEXT: vblendps {{.*#+}} ymm0 = 
mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; KNL-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -76,7 +76,7 @@ define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_4f32_unpckh: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 @@ -85,7 +85,7 @@ define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_4f32_unpckl: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 @@ -143,7 +143,7 @@ ; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> @@ -274,7 +274,7 @@ define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_4f32_4stage: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) %2 = tail call <4 x float> 
@llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> ) @@ -286,7 +286,7 @@ define <8 x float> @combine_vpermilvar_8f32_4stage(<8 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_8f32_4stage: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> ) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -755,7 +755,7 @@ define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -519,7 +519,7 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps(<16 x float> %x0, <16 x float> %x1) { ; CHECK-LABEL: combine_vpermt2var_16f32_vpermilps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; CHECK-NEXT: ret{{[l|q]}} %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1) ret <16 x 
float> %res0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -204,7 +204,7 @@ ; ; AVX-LABEL: combine_pshufb_palignr: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> ) @@ -723,7 +723,7 @@ ; ; AVX1-LABEL: combine_pshufb_pshufb_or_pshufb: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -100,7 +100,7 @@ ; ; AVX1-LABEL: combine_pshufd6: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_pshufd6: @@ -178,7 +178,7 @@ ; AVX-LABEL: combine_bitwise_ops_test1: ; AVX: # %bb.0: ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> @@ -196,7 +196,7 @@ ; AVX-LABEL: combine_bitwise_ops_test2: ; AVX: # %bb.0: ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector 
<4 x i32> %b, <4 x i32> %c, <4 x i32> @@ -214,7 +214,7 @@ ; AVX-LABEL: combine_bitwise_ops_test3: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> @@ -232,7 +232,7 @@ ; AVX-LABEL: combine_bitwise_ops_test4: ; AVX: # %bb.0: ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> @@ -250,7 +250,7 @@ ; AVX-LABEL: combine_bitwise_ops_test5: ; AVX: # %bb.0: ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> @@ -268,7 +268,7 @@ ; AVX-LABEL: combine_bitwise_ops_test6: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> @@ -628,7 +628,7 @@ ; ; AVX-LABEL: combine_nested_undef_test1: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -643,7 +643,7 @@ ; ; AVX-LABEL: combine_nested_undef_test2: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX-NEXT: retq %1 = 
shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -658,7 +658,7 @@ ; ; AVX-LABEL: combine_nested_undef_test3: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -673,7 +673,7 @@ ; ; AVX1-LABEL: combine_nested_undef_test4: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test4: @@ -693,7 +693,7 @@ ; ; AVX-LABEL: combine_nested_undef_test5: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -708,7 +708,7 @@ ; ; AVX-LABEL: combine_nested_undef_test6: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -723,7 +723,7 @@ ; ; AVX-LABEL: combine_nested_undef_test7: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -738,7 +738,7 @@ ; ; AVX-LABEL: combine_nested_undef_test8: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -753,7 +753,7 @@ ; ; AVX-LABEL: 
combine_nested_undef_test9: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,2] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -768,7 +768,7 @@ ; ; AVX-LABEL: combine_nested_undef_test10: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -783,7 +783,7 @@ ; ; AVX-LABEL: combine_nested_undef_test11: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -798,7 +798,7 @@ ; ; AVX1-LABEL: combine_nested_undef_test12: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test12: @@ -867,15 +867,15 @@ ; ; AVX1-LABEL: combine_nested_undef_test15: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test15: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> @@ -906,7 +906,7 @@ ; ; AVX-LABEL: combine_nested_undef_test16: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> @@ -936,7 +936,7 @@ ; AVX-LABEL: combine_nested_undef_test17: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -951,7 +951,7 @@ ; ; AVX-LABEL: combine_nested_undef_test18: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -980,7 +980,7 @@ ; AVX-LABEL: combine_nested_undef_test19: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1011,7 +1011,7 @@ ; AVX-LABEL: combine_nested_undef_test20: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1040,7 +1040,7 @@ ; AVX1-LABEL: combine_nested_undef_test21: ; AVX1: # %bb.0: ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test21: @@ -1065,7 
+1065,7 @@ ; ; AVX-LABEL: combine_nested_undef_test22: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,1,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1080,7 +1080,7 @@ ; ; AVX-LABEL: combine_nested_undef_test23: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1095,7 +1095,7 @@ ; ; AVX-LABEL: combine_nested_undef_test24: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1110,7 +1110,7 @@ ; ; AVX1-LABEL: combine_nested_undef_test25: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test25: @@ -1130,7 +1130,7 @@ ; ; AVX-LABEL: combine_nested_undef_test26: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1145,7 +1145,7 @@ ; ; AVX1-LABEL: combine_nested_undef_test27: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test27: @@ -1165,7 +1165,7 @@ ; ; AVX-LABEL: combine_nested_undef_test28: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,0] ; AVX-NEXT: retq %1 = shufflevector <4 
x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1640,7 +1640,7 @@ ; ; AVX-LABEL: combine_test1b: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,2,0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1695,7 +1695,7 @@ ; AVX-LABEL: combine_test3b: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1711,7 +1711,7 @@ ; ; AVX-LABEL: combine_test4b: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2457,7 +2457,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: combine_unneeded_subvector1: @@ -2500,7 +2500,7 @@ ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_unneeded_subvector2: @@ -2742,7 +2742,7 @@ ; AVX: # %bb.0: # %entry ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1] -; AVX-NEXT: 
vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq entry: %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> @@ -2778,7 +2778,7 @@ ; ; AVX-LABEL: PR22390: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,1,2] ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll @@ -22,7 +22,7 @@ ; AVX-LABEL: concat_a_to_shuf_of_a: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: vmovaps %ymm0, (%rsi) ; AVX-NEXT: vzeroupper @@ -69,7 +69,7 @@ ; AVX-LABEL: concat_shuf_of_a_to_a: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: vmovaps %ymm0, (%rdx) ; AVX-NEXT: vzeroupper @@ -119,7 +119,7 @@ ; AVX-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vmovaps %xmm1, (%rdx) ; AVX-NEXT: vmovaps %xmm0, 16(%rsi) ; AVX-NEXT: vmovaps %xmm1, (%rsi) @@ -128,7 +128,7 @@ ; AVX2-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vmovaps %xmm1, (%rdx) ; AVX2-NEXT: vmovaps %xmm0, 16(%rsi) ; AVX2-NEXT: vmovaps %xmm1, 
(%rsi) @@ -137,7 +137,7 @@ ; AVX512F-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512F-NEXT: vmovaps %xmm1, (%rdx) ; AVX512F-NEXT: vmovaps %xmm0, 16(%rsi) ; AVX512F-NEXT: vmovaps %xmm1, (%rsi) @@ -146,7 +146,7 @@ ; AVX512BW-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vmovaps %xmm1, (%rdx) ; AVX512BW-NEXT: vmovaps %xmm0, 16(%rsi) ; AVX512BW-NEXT: vmovaps %xmm1, (%rsi) @@ -618,7 +618,7 @@ ; AVX-LABEL: concat_aaa_to_shuf_of_a: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 ; AVX-NEXT: vmovaps %ymm0, 32(%rsi) ; AVX-NEXT: vmovaps %ymm1, (%rsi) @@ -675,7 +675,7 @@ ; AVX-LABEL: concat_shuf_of_a_to_aaa: ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX-NEXT: vmovaps %ymm0, (%rsi) ; AVX-NEXT: vmovaps %ymm1, 32(%rsi) diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -119,7 +119,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32: 
@@ -263,7 +263,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vmovlpd %xmm0, (%rdi) ; AVX-NEXT: retq ; @@ -1119,7 +1119,7 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-NEXT: retq ; @@ -1131,7 +1131,7 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: retq ; @@ -1288,7 +1288,7 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-NEXT: vmovd %xmm0, (%rdi) ; AVX1-NEXT: retq @@ -1301,7 +1301,7 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi) ; 
AVX2-SLOW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -80,7 +80,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] ; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_usat_v2i64_v2i32: @@ -183,7 +183,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] ; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vmovlpd %xmm0, (%rdi) ; AVX-NEXT: retq ; @@ -781,7 +781,7 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-NEXT: retq ; @@ -792,7 +792,7 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: retq ; @@ -907,7 +907,7 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[0,2,2,3] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX1-NEXT: vmovd %xmm0, (%rdi) ; AVX1-NEXT: retq @@ -919,7 +919,7 @@ ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343] ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi) ; AVX2-SLOW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll --- a/llvm/test/CodeGen/X86/vselect-avx.ll +++ b/llvm/test/CodeGen/X86/vselect-avx.ll @@ -288,16 +288,16 @@ ; AVX1-LABEL: vselect_concat_splat: ; AVX1: ## %bb.0: ## %entry ; AVX1-NEXT: vmovups (%rax), %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,3,2,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2] ; AVX1-NEXT: vmovups 16, %xmm2 ; AVX1-NEXT: vmovups 32, %xmm3 ; AVX1-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm3[1],mem[2,3] ; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,3,2,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,3,2,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3] ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,0,3,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,3,2] ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: vcmpneqps %xmm3, %xmm1, %xmm3 ; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1720,13 +1720,13 @@ ; AVX2: # %bb.0: ; 
AVX2-NEXT: vmovups (%rdi), %ymm0 ; AVX2-NEXT: vmovups (%rdi), %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-NEXT: vmovups %ymm0, 96(%rsi) ; AVX2-NEXT: vmovups %ymm3, 64(%rsi) @@ -1780,13 +1780,13 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vmovups (%rdi), %ymm0 ; AVX2-NEXT: vmovups (%rdi), %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1] ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-NEXT: vmovups %ymm0, 96(%rsi) ; AVX2-NEXT: vmovups %ymm3, 64(%rsi) diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -1942,7 +1942,7 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 @@ -4645,7 +4645,7 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3] ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] @@ -4852,7 +4852,7 @@ ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7] ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 @@ -6609,7 +6609,7 @@ ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ 
b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1562,7 +1562,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 @@ -3743,7 +3743,7 @@ ; AVX-NEXT: vmovaps 48(%rdi), %xmm1 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3] ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -3896,13 +3896,13 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %xmm0 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX-NEXT: vbroadcastss (%rdi), %ymm2 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7] ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1] ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 @@ -5369,7 +5369,7 @@ ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; 
AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3 diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn @@ -94,6 +94,7 @@ "X86FastTileConfig.cpp", "X86FixupBWInsts.cpp", "X86FixupLEAs.cpp", + "X86FixupInstTuning.cpp", "X86FixupSetCC.cpp", "X86FlagsCopyLowering.cpp", "X86FloatingPoint.cpp",