diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -8130,6 +8130,16 @@
   case X86::VPBLENDWYrmi:
   case X86::VPBLENDWYrri:
     return GetBlendDomains(8, false);
+  case X86::SHUFPSrri:
+  case X86::VSHUFPSrri:
+  case X86::VSHUFPSYrri:
+  case X86::VSHUFPSZ128rri:
+  case X86::VSHUFPSZ256rri:
+  case X86::VSHUFPSZrri:
+    // If the shuffled registers match, we can transform this to any domain.
+    if (MI.getOperand(1).getReg() != MI.getOperand(2).getReg())
+      return 0;
+    return 0x2; // PackedSingle
   case X86::VPANDDZ128rr: case X86::VPANDDZ128rm:
   case X86::VPANDDZ256rr: case X86::VPANDDZ256rm:
   case X86::VPANDQZ128rr: case X86::VPANDQZ128rm:
@@ -8224,6 +8234,29 @@
     return true;
   };
+  auto setShuffleIntToFPDomain = [&](unsigned NewOpc) {
+    if (Domain >= 3)
+      return true;
+
+    unsigned MaskImm = MI.getOperand(NumOperands - 1).getImm();
+    MI.removeOperand(NumOperands - 1);
+    MI.addOperand(MI.getOperand(1));
+    // MI.tieOperands(1, 2);
+    MI.setDesc(get(NewOpc));
+    MI.addOperand(MachineOperand::CreateImm(MaskImm));
+    return true;
+  };
+
+  auto setShuffleFPToIntDomain = [&](unsigned NewOpc) {
+    // We are converting `rri` -> `ri`, so the two shuffled registers must match.
+    if (Domain != 3 || MI.getOperand(1).getReg() != MI.getOperand(2).getReg())
+      return true;
+
+    MI.removeOperand(1);
+    MI.setDesc(get(NewOpc));
+    return true;
+  };
+
   switch (Opcode) {
   case X86::BLENDPDrmi:
   case X86::BLENDPDrri:
@@ -8253,6 +8286,37 @@
   case X86::VPBLENDWYrmi:
   case X86::VPBLENDWYrri:
     return SetBlendDomain(16, true);
+  case X86::SHUFPSrri:
+    return setShuffleFPToIntDomain(X86::PSHUFDri);
+  case X86::VSHUFPSrri:
+    return setShuffleFPToIntDomain(X86::VPSHUFDri);
+  case X86::VSHUFPSYrri:
+    return setShuffleFPToIntDomain(X86::VPSHUFDYri);
+  case X86::VSHUFPSZ128rri:
+    return setShuffleFPToIntDomain(X86::VPSHUFDZ128ri);
+  case X86::VSHUFPSZ256rri:
+    return setShuffleFPToIntDomain(X86::VPSHUFDZ256ri);
+  case X86::VSHUFPSZrri:
+    return setShuffleFPToIntDomain(X86::VPSHUFDZri);
+  case X86::PSHUFDri:
+    return setShuffleIntToFPDomain(X86::SHUFPSrri);
+  case X86::VPSHUFDri:
+    return setShuffleIntToFPDomain(X86::VSHUFPSrri);
+  case X86::VPSHUFDYri:
+    return setShuffleIntToFPDomain(X86::VSHUFPSYrri);
+  case X86::VPSHUFDZ128ri:
+    return setShuffleIntToFPDomain(X86::VSHUFPSZ128rri);
+  case X86::VPSHUFDZ256ri:
+    return setShuffleIntToFPDomain(X86::VSHUFPSZ256rri);
+  case X86::VPSHUFDZri:
+    return setShuffleIntToFPDomain(X86::VSHUFPSZrri);
+  case X86::PSHUFDmi:
+  case X86::VPSHUFDmi:
+  case X86::VPSHUFDYmi:
+  case X86::VPSHUFDZ128mi:
+  case X86::VPSHUFDZ256mi:
+  case X86::VPSHUFDZmi:
+    return Subtarget.hasNoDomainDelayShuffle();
   case X86::VPANDDZ128rr: case X86::VPANDDZ128rm:
   case X86::VPANDDZ256rr: case X86::VPANDDZ256rm:
   case X86::VPANDQZ128rr: case X86::VPANDQZ128rm:
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -7369,6 +7369,16 @@
 // VPERMIL - Permute Single and Double Floating-Point Values
 //
+// shufps is preferable if vpermilps has register arguments. It is either
+// faster or equal in speed and has a shorter encoding.
+def : Pat<(v4f32 (X86VPermilpi VR128:$src1, (i8 timm:$src2))), + (VSHUFPSrri VR128:$src1, VR128:$src1, (i8 timm:$src2))>; + +let Predicates = [HasAVX] in { +def : Pat<(v8f32 (X86VPermilpi VR256:$src1, (i8 timm:$src2))), + (VSHUFPSYrri VR256:$src1, VR256:$src1, (i8 timm:$src2))>; +} + multiclass avx_permil opc_rm, bits<8> opc_rmi, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop_f, X86MemOperand x86memop_i, @@ -7721,15 +7731,15 @@ let Predicates = [HasAVX1Only] in { def : Pat<(v4f32 (X86VBroadcast FR32:$src)), - (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; + (VSHUFPSrri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)>; def : Pat<(v8f32 (X86VBroadcast FR32:$src)), (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), - (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), - (v4f32 (VPERMILPSri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; + (v4f32 (VSHUFPSrri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), sub_xmm), + (v4f32 (VSHUFPSrri (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 0)), 1)>; def : Pat<(v8f32 (X86VBroadcast v4f32:$src)), (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), - (v4f32 (VPERMILPSri VR128:$src, 0)), sub_xmm), - (v4f32 (VPERMILPSri VR128:$src, 0)), 1)>; + (v4f32 (VSHUFPSrri VR128:$src, VR128:$src, 0)), sub_xmm), + (v4f32 (VSHUFPSrri VR128:$src, VR128:$src, 0)), 1)>; def : Pat<(v4f64 (X86VBroadcast FR64:$src)), (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), (v2f64 (VMOVDDUPrr (v2f64 (COPY_TO_REGCLASS FR64:$src, VR128)))), sub_xmm), diff --git a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll --- a/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll +++ b/llvm/test/CodeGen/X86/2012-01-12-extract-sv.ll @@ -6,7 +6,7 @@ ; AVX1-LABEL: endless_loop: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovaps (%eax), %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7] diff --git a/llvm/test/CodeGen/X86/SwizzleShuff.ll b/llvm/test/CodeGen/X86/SwizzleShuff.ll --- a/llvm/test/CodeGen/X86/SwizzleShuff.ll +++ b/llvm/test/CodeGen/X86/SwizzleShuff.ll @@ -21,8 +21,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[1,2] -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,2,2] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,2] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,2,2] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,2] ; CHECK-NEXT: vxorps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq %A = load <4 x i32>, ptr %pA diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1942,7 +1942,7 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1 ; 
AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0 @@ -4242,7 +4242,7 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 @@ -4369,7 +4369,7 @@ ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1562,7 +1562,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 @@ -3439,7 +3439,7 @@ ; AVX-NEXT: vmovaps 48(%rdi), %xmm0 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[1,3],ymm1[4,4],ymm0[5,7] -; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,1,0,1] ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1275,7 +1275,7 @@ define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind { ; CHECK-LABEL: test_mm_permute_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> ret <4 x float> %res @@ -1284,7 +1284,7 @@ define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind { ; CHECK-LABEL: test2_mm_permute_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,2,3] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> ret <4 x float> %res @@ -1293,7 +1293,7 @@ define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind { ; CHECK-LABEL: test_mm256_permute_ps: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; CHECK-NEXT: ret{{[l|q]}} %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> ret <8 x float> %res @@ -1934,7 +1934,7 @@ ; X86-LABEL: test_mm256_set1_epi32: ; X86: # %bb.0: ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: 
vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X86-NEXT: retl ; @@ -2002,13 +2002,13 @@ ; X86-LABEL: test_mm256_set1_ps: ; X86: # %bb.0: ; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: test_mm256_set1_ps: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %res0 = insertelement <8 x float> undef, float %a0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll @@ -877,17 +877,11 @@ define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) { -; AVX-LABEL: test_x86_avx_vpermil_ps: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps $7, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x07] -; AVX-NEXT: # xmm0 = xmm0[3,1,0,0] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_vpermil_ps: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps $7, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x07] -; AVX512VL-NEXT: # xmm0 = xmm0[3,1,0,0] -; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_vpermil_ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps $7, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x07] +; CHECK-NEXT: # xmm0 = xmm0[3,1,0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1] ret <4 x float> %res } @@ -895,17 +889,11 @@ define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) { -; AVX-LABEL: test_x86_avx_vpermil_ps_256: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps $7, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x07] -; AVX-NEXT: # ymm0 = ymm0[3,1,0,0,7,5,4,4] -; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3] -; -; AVX512VL-LABEL: test_x86_avx_vpermil_ps_256: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps $7, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x07] -; AVX512VL-NEXT: # ymm0 = ymm0[3,1,0,0,7,5,4,4] -; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; CHECK-LABEL: test_x86_avx_vpermil_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps $7, %ymm0, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0xc6,0xc0,0x07] +; CHECK-NEXT: # ymm0 = ymm0[3,1,0,0,7,5,4,4] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1] ret <8 x float> %res } diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll --- a/llvm/test/CodeGen/X86/avx-splat.ll +++ b/llvm/test/CodeGen/X86/avx-splat.ll @@ -145,7 +145,7 @@ define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp { ; CHECK-LABEL: funcG: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} entry: @@ -157,7 +157,7 @@ ; CHECK-LABEL: funcH: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm0[1,1,1,1,5,5,5,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5] ; CHECK-NEXT: ret{{[l|q]}} entry: %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -624,12 +624,12 @@ define <4 x i32> @H(<4 x i32> %a) { ; X86-LABEL: H: ; X86: ## %bb.0: ## %entry -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NEXT: retl ; ; X64-LABEL: H: ; X64: ## %bb.0: ## %entry -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: retq entry: %x = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> @@ -982,7 +982,7 @@ ; X64: ## %bb.0: ## %bb ; X64-NEXT: movdq2q %xmm0, %mm0 ; X64-NEXT: movq2dq %mm0, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X64-NEXT: retq bb: %tmp1 = bitcast x86_mmx %tmp to i64 diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll --- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll +++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll @@ -29,7 +29,7 @@ ; CHECK-NEXT: vminpd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vminsd %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-NEXT: vmovups %xmm0, (%rax) ; CHECK-NEXT: retq allocas: diff --git a/llvm/test/CodeGen/X86/avx-vperm2x128.ll b/llvm/test/CodeGen/X86/avx-vperm2x128.ll --- a/llvm/test/CodeGen/X86/avx-vperm2x128.ll +++ b/llvm/test/CodeGen/X86/avx-vperm2x128.ll @@ -336,7 +336,7 @@ ; ALL-LABEL: shuffle_v8f32_uu67ucuf: ; ALL: # %bb.0: # %entry ; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] ; ALL-NEXT: retq entry: %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -2015,7 +2015,7 @@ define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) { ; CHECK-LABEL: test_mm256_shuffle_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <8 x i32> %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -211,7 +211,7 @@ define <4 x i64> @f32to4sl(<4 x float> %a) { ; NODQ-LABEL: f32to4sl: ; NODQ: # %bb.0: -; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; NODQ-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; NODQ-NEXT: vcvttss2si %xmm1, %rax ; NODQ-NEXT: vmovq %rax, %xmm1 ; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1707,7 +1707,7 @@ ; NOVL: # %bb.0: ; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; NOVL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 ; NOVL-NEXT: 
retq ; diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -1128,7 +1128,7 @@ define <8 x i64> @test_mm512_shuffle_epi32(<8 x i64> %a0) { ; CHECK-LABEL: test_mm512_shuffle_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <8 x i64> %a0 to <16 x i32> %res0 = shufflevector <16 x i32> %arg0, <16 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -1072,7 +1072,7 @@ define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pshuf_d_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpermilps $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x03] +; CHECK-NEXT: vshufps $3, %zmm0, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xc0,0x03] ; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] ; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/in_lane_permute.ll @@ -6,7 +6,7 @@ define <4 x float> @test_4xfloat_perm_mask0(<4 x float> %vec) { ; CHECK-LABEL: test_4xfloat_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,1] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,1] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> ret <4 x float> %res @@ -92,7 +92,7 @@ define <4 x float> @test_4xfloat_perm_mask3(<4 x float> %vec) { ; CHECK-LABEL: test_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,2] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,2] ; CHECK-NEXT: retq %res = shufflevector <4 x float> %vec, <4 x float> undef, <4 x i32> ret <4 x float> %res @@ -342,7 +342,7 @@ define <8 x float> @test_8xfloat_perm_imm_mask3(<8 x float> %vec) { ; CHECK-LABEL: test_8xfloat_perm_imm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,1,0,6,6,5,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,1,0,6,6,5,4] ; CHECK-NEXT: retq %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle.ll @@ -2091,7 +2091,7 @@ define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) { ; CHECK-LABEL: test_4xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,3,0] ; CHECK-NEXT: retq %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res @@ -2171,7 +2171,7 @@ define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) { ; CHECK-LABEL: test_4xi32_perm_mask3: ; CHECK: 
# %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,3] ; CHECK-NEXT: retq %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res @@ -2325,7 +2325,7 @@ define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -2405,7 +2405,7 @@ define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { ; CHECK-LABEL: test_8xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] ; CHECK-NEXT: retq %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res @@ -2559,7 +2559,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res @@ -2639,7 +2639,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; CHECK-LABEL: test_16xi32_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] ; CHECK-NEXT: retq %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res diff --git a/llvm/test/CodeGen/X86/avx512-trunc.ll b/llvm/test/CodeGen/X86/avx512-trunc.ll --- a/llvm/test/CodeGen/X86/avx512-trunc.ll +++ b/llvm/test/CodeGen/X86/avx512-trunc.ll @@ -270,7 +270,7 @@ define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { ; ALL-LABEL: trunc_qd_128: ; ALL: ## %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; ALL-NEXT: retq %x = trunc <2 x i64> %i to <2 x i32> ret <2 x i32> %x @@ -279,7 +279,7 @@ define void @trunc_qd_128_mem(<2 x i64> %i, ptr %res) #0 { ; KNL-LABEL: trunc_qd_128_mem: ; KNL: ## %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; KNL-NEXT: vmovlps %xmm0, (%rdi) ; KNL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1211,7 +1211,7 @@ ; AVX512-LABEL: test46: ; AVX512: ## %bb.0: ; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x00] -; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] +; AVX512-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xd4] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3] ; AVX512-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x54,0x05,A,A,A,A] ; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte diff --git a/llvm/test/CodeGen/X86/avx512fp16-frem.ll b/llvm/test/CodeGen/X86/avx512fp16-frem.ll --- a/llvm/test/CodeGen/X86/avx512fp16-frem.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-frem.ll @@ -28,11 +28,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh 
%xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -117,11 +117,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -206,11 +206,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -300,11 +300,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -380,12 +380,13 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps 
$255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -476,11 +477,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -561,11 +562,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -647,11 +648,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -727,12 +728,13 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm1, %xmm1, %xmm1 +; 
CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -905,11 +907,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -999,11 +1001,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1079,12 +1081,13 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -1175,11 +1178,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1260,11 +1263,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; 
CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1346,11 +1349,11 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 @@ -1426,12 +1429,13 @@ ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm1 -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vpunpcklwd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/avx512fp16-mov.ll b/llvm/test/CodeGen/X86/avx512fp16-mov.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mov.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mov.ll @@ -1334,7 +1334,7 @@ define half @extract_f16_6(<8 x half> %x) { ; CHECK-LABEL: extract_f16_6: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: ret{{[l|q]}} %res = extractelement <8 x half> %x, i32 6 ret half %res @@ -1606,14 +1606,14 @@ define void @extract_store_f16_6(<8 x half> %x, ptr %y) { ; X64-LABEL: extract_store_f16_6: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; X64-NEXT: vmovsh %xmm0, (%rdi) ; X64-NEXT: retq ; ; X86-LABEL: extract_store_f16_6: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; X86-NEXT: vmovsh %xmm0, (%eax) ; X86-NEXT: retl %res = extractelement <8 x half> %x, i32 6 diff --git a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll 
b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll --- a/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-mscatter.ll @@ -31,7 +31,7 @@ ; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpextrq $1, %xmm4, %rax ; CHECK-NEXT: vmovsh %xmm3, (%rax) -; CHECK-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; CHECK-NEXT: vextracti32x4 $3, %zmm0, %xmm0 ; CHECK-NEXT: vmovq %xmm0, %rax ; CHECK-NEXT: vmovsh %xmm3, (%rax) @@ -58,7 +58,7 @@ ; CHECK-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; CHECK-NEXT: vpextrq $1, %xmm3, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; CHECK-NEXT: vextracti32x4 $3, %zmm2, %xmm2 ; CHECK-NEXT: vmovq %xmm2, %rax ; CHECK-NEXT: vmovsh %xmm1, (%rax) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -891,7 +891,7 @@ define <8 x float>@test_int_x86_avx512_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x16] +; CHECK-NEXT: vshufps $22, %ymm0, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0xc6,0xc0,0x16] ; CHECK-NEXT: # ymm0 = ymm0[2,1,1,0,6,5,5,4] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1) @@ -943,7 +943,7 @@ define <4 x float>@test_int_x86_avx512_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x16] +; CHECK-NEXT: vshufps $22, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x16] ; CHECK-NEXT: # xmm0 = xmm0[2,1,1,0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1) @@ -1967,7 +1967,7 @@ define <4 x i32>@test_int_x86_avx512_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pshuf_d_128: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x03] +; CHECK-NEXT: vshufps $3, %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x03] ; CHECK-NEXT: # xmm0 = xmm0[3,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) @@ -2019,7 +2019,7 @@ define <8 x i32>@test_int_x86_avx512_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_pshuf_d_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x03] +; CHECK-NEXT: vshufps $3, %ymm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xc0,0x03] ; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0,7,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) 
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -352,7 +352,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] @@ -617,7 +617,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -627,7 +627,7 @@ ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -451,7 +451,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] @@ -806,7 +806,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 @@ -821,7 +821,7 @@ ; AVX1-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7] ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vpcmpeqb %xmm2, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll --- 
a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -246,7 +246,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll --- a/llvm/test/CodeGen/X86/buildvec-extract.ll +++ b/llvm/test/CodeGen/X86/buildvec-extract.ll @@ -83,7 +83,7 @@ ; ; AVX-LABEL: extract1_i32_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq @@ -128,7 +128,7 @@ ; ; AVX-LABEL: extract2_i32_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: retq @@ -187,7 +187,7 @@ ; ; AVX-LABEL: extract0_i32_zext_insert1_i64_undef: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -624,7 +624,7 @@ ; AVX1-LABEL: neg_scalar_broadcast_v8i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,3] @@ -790,7 +790,7 @@ ; ; AVX1-LABEL: casted_neg_scalar_broadcast_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -91,7 +91,7 @@ ; AVX1-NEXT: vmovaps (%rdi), %ymm0 ; AVX1-NEXT: vmovaps (%rsi), %ymm1 ; AVX1-NEXT: vmovaps (%rdx), %ymm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4],ymm2[5,6],ymm0[7] diff --git a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll --- a/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll +++ b/llvm/test/CodeGen/X86/copy-low-subvec-elt-to-high-subvec-elt.ll @@ -47,7 +47,7 @@ define <4 x float> 
@vec128_eltty_float_source_subvec_0_target_subvec_mask_1_unary(<4 x float> %x) nounwind { ; CHECK-LABEL: vec128_eltty_float_source_subvec_0_target_subvec_mask_1_unary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; CHECK-NEXT: retq %r = shufflevector <4 x float> %x, <4 x float> poison, <4 x i32> ret <4 x float> %r @@ -65,7 +65,7 @@ define <4 x i32> @vec128_eltty_i32_source_subvec_0_target_subvec_mask_1_unary(<4 x i32> %x) nounwind { ; CHECK-LABEL: vec128_eltty_i32_source_subvec_0_target_subvec_mask_1_unary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; CHECK-NEXT: retq %r = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> ret <4 x i32> %r @@ -322,7 +322,7 @@ define <4 x i64> @vec256_eltty_i64_source_subvec_1_target_subvec_mask_2_binary(<4 x i64> %x, <4 x i64> %y) nounwind { ; CHECK-LABEL: vec256_eltty_i64_source_subvec_1_target_subvec_mask_2_binary: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,4,5] ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; CHECK-NEXT: retq %r = shufflevector <4 x i64> %x, <4 x i64> %y, <4 x i32> @@ -448,7 +448,7 @@ ; ; CHECK-FAST-LABEL: vec256_eltty_float_source_subvec_1_target_subvec_mask_2_binary: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; CHECK-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; CHECK-FAST-NEXT: retq %r = shufflevector <8 x float> %x, <8 x float> %y, <8 x i32> @@ -577,7 +577,7 @@ ; ; CHECK-FAST-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_2_binary: ; CHECK-FAST: # %bb.0: -; CHECK-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; CHECK-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; CHECK-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; CHECK-FAST-NEXT: retq %r = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> @@ -598,7 +598,7 @@ ; CHECK-SLOW-LABEL: vec256_eltty_i32_source_subvec_1_target_subvec_mask_3_binary: ; CHECK-SLOW: # %bb.0: ; CHECK-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3] -; CHECK-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] +; CHECK-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] ; CHECK-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6],ymm1[7] ; CHECK-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll --- a/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-fp-intrinsics.ll @@ -139,10 +139,10 @@ ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX-NEXT: # xmm1 = mem[3,3,3,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: callq fmodf@PLT ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; 
AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -68,9 +68,9 @@ ; ; AVX1-LABEL: catcat: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: vmovddup {{.*#+}} ymm2 = ymm0[0,0,2,2] diff --git a/llvm/test/CodeGen/X86/extract-store.ll b/llvm/test/CodeGen/X86/extract-store.ll --- a/llvm/test/CodeGen/X86/extract-store.ll +++ b/llvm/test/CodeGen/X86/extract-store.ll @@ -326,7 +326,7 @@ ; AVX-X86-LABEL: extract_i64_1: ; AVX-X86: # %bb.0: ; AVX-X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-X86-NEXT: vmovlps %xmm0, (%eax) ; AVX-X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/fdiv-combine-vec.ll b/llvm/test/CodeGen/X86/fdiv-combine-vec.ll --- a/llvm/test/CodeGen/X86/fdiv-combine-vec.ll +++ b/llvm/test/CodeGen/X86/fdiv-combine-vec.ll @@ -61,7 +61,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %vy = insertelement <4 x float> undef, float %y, i32 0 @@ -84,7 +84,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -115,7 +115,7 @@ ; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %vy = insertelement <4 x float> undef, float %y, i32 0 @@ -146,7 +146,7 @@ ; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX-NEXT: vmulps %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fmaddsub-combine.ll b/llvm/test/CodeGen/X86/fmaddsub-combine.ll --- a/llvm/test/CodeGen/X86/fmaddsub-combine.ll +++ b/llvm/test/CodeGen/X86/fmaddsub-combine.ll @@ -490,8 +490,8 @@ ; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NOFMA-NEXT: retq @@ -578,8 +578,8 @@ ; NOFMA-NEXT: vsubss %xmm9, %xmm8, %xmm8 ; 
NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm2, %xmm0, %xmm0 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm1 = xmm4[1,1,3,3] @@ -587,8 +587,8 @@ ; NOFMA-NEXT: vsubss %xmm2, %xmm1, %xmm1 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0],xmm1[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm5[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NOFMA-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -719,12 +719,12 @@ ; NOFMA-NEXT: vsubss %xmm15, %xmm14, %xmm14 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm4[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm3, %xmm0, %xmm0 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm3 = xmm7[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm2 = xmm6[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm2 = xmm8[0,1,2],xmm2[0] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] @@ -732,16 +732,16 @@ ; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[2,3] ; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm10[0],xmm3[3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm5[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm5[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm4, %xmm1, %xmm1 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm3 = xmm11[1,1,3,3] ; NOFMA-NEXT: vmovshdup {{.*#+}} xmm4 = xmm13[1,1,3,3] ; NOFMA-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; NOFMA-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm12[0,0] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[3,3,3,3] -; NOFMA-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm4 = xmm11[3,3,3,3] +; NOFMA-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3,3,3] ; NOFMA-NEXT: vsubss %xmm5, %xmm4, %xmm4 ; NOFMA-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] ; NOFMA-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll --- a/llvm/test/CodeGen/X86/haddsub-2.ll +++ b/llvm/test/CodeGen/X86/haddsub-2.ll @@ -907,12 +907,12 @@ ; AVX-LABEL: not_a_hsub_2: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, 
%xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -66,13 +66,13 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq %lhs = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> @@ -96,17 +96,17 @@ ; AVX1-LABEL: hadd_reverse2_v8f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] ; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse2_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,1,0,7,6,5,4] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -133,13 +133,13 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse3_v8f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> @@ -337,17 +337,17 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm3[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse_v16f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,0,3,1] ; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: 
vpermpd {{.*#+}} ymm1 = ymm0[2,0,3,1] ; AVX2-NEXT: vmovaps %ymm3, %ymm0 ; AVX2-NEXT: retq @@ -380,20 +380,20 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vhaddps %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0,3,2,5,4,7,6] ; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,3,2,5,4,7,6] ; AVX1-NEXT: vmovaps %ymm3, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse2_v16f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vhaddps %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[2,3,0,1] ; AVX2-NEXT: vhaddps %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX2-NEXT: vmovaps %ymm3, %ymm0 ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -208,7 +208,7 @@ ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-SLOW-NEXT: retq @@ -216,7 +216,7 @@ ; AVX-FAST-LABEL: test8_undef: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX-FAST-NEXT: retq %vecext = extractelement <4 x float> %a, i32 0 %vecext1 = extractelement <4 x float> %a, i32 1 @@ -377,7 +377,7 @@ ; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX512-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -385,7 +385,7 @@ ; AVX512-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512-SLOW-NEXT: retq @@ -520,15 +520,15 @@ ; ; AVX-SLOW-LABEL: add_ps_030: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,1,2,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; ; AVX-FAST-LABEL: add_ps_030: ; AVX-FAST: # %bb.0: ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] 
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX-FAST-NEXT: retq %l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> %r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> @@ -591,7 +591,7 @@ ; AVX-LABEL: add_ps_016: ; AVX: # %bb.0: ; AVX-NEXT: vhaddps %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,3] ; AVX-NEXT: retq %3 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> %4 = shufflevector <4 x float> %1, <4 x float> %0, <2 x i32> @@ -620,8 +620,8 @@ ; ; AVX-SLOW-LABEL: add_ps_017: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,2,2] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -943,8 +943,8 @@ ; ; AVX-SLOW-LABEL: PR45747_1: ; AVX-SLOW: # %bb.0: -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,2,2] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,2,2,2] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -976,7 +976,7 @@ ; AVX-SLOW-LABEL: PR45747_2: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,1,1] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,1,1] ; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll --- a/llvm/test/CodeGen/X86/haddsub.ll +++ b/llvm/test/CodeGen/X86/haddsub.ll @@ -502,7 +502,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -563,7 +563,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v4f32_fadd_f32_commute: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -683,7 +683,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v4f32_fsub_f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: retq ; @@ -729,7 +729,7 @@ ; AVX-LABEL: extract_extract23_v4f32_fsub_f32_commute: ; AVX: # %bb.0: ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %x0 = extractelement <4 x float> %x, i32 2 @@ -837,7 +837,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -873,7 +873,7 @@ ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-SLOW-NEXT: vpermilpd 
{{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -939,7 +939,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v8f32_fadd_f32_commute: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -975,7 +975,7 @@ ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq @@ -1172,7 +1172,7 @@ ; AVX-SLOW-LABEL: extract_extract23_v8f32_fsub_f32: ; AVX-SLOW: # %bb.0: ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vsubss %xmm0, %xmm1, %xmm0 ; AVX-SLOW-NEXT: vzeroupper ; AVX-SLOW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -457,7 +457,7 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -470,7 +470,7 @@ ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -531,7 +531,7 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -543,7 +543,7 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -1122,7 +1122,7 @@ ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, 
%xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -1137,7 +1137,7 @@ ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1240,7 +1240,7 @@ ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -1254,7 +1254,7 @@ ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -460,7 +460,7 @@ ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -473,7 +473,7 @@ ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -535,7 +535,7 @@ ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -547,7 +547,7 @@ ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax @@ -1126,7 +1126,7 @@ ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: 
vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax @@ -1141,7 +1141,7 @@ ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax @@ -1244,7 +1244,7 @@ ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vmovq %xmm0, %rax @@ -1258,7 +1258,7 @@ ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX2-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -52,7 +52,7 @@ ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 @@ -531,7 +531,7 @@ ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -549,7 +549,7 @@ ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -621,7 +621,7 @@ ; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -638,7 +638,7 @@ ; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 
@@ -1236,7 +1236,7 @@ ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1258,7 +1258,7 @@ ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1381,7 +1381,7 @@ ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1402,7 +1402,7 @@ ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -53,7 +53,7 @@ ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 @@ -475,7 +475,7 @@ ; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -493,7 +493,7 @@ ; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -567,7 +567,7 @@ ; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; X64-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -584,7 +584,7 @@ ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq 
%xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1152,7 +1152,7 @@ ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1174,7 +1174,7 @@ ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -1299,7 +1299,7 @@ ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -1320,7 +1320,7 @@ ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-2.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-2.ll --- a/llvm/test/CodeGen/X86/horizontal-shuffle-2.ll +++ b/llvm/test/CodeGen/X86/horizontal-shuffle-2.ll @@ -14,7 +14,7 @@ ; AVX-LABEL: test_unpacklo_hadd_v4f32: ; AVX: ## %bb.0: ; AVX-NEXT: vhaddps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: ret{{[l|q]}} %5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4 %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4 @@ -33,7 +33,7 @@ ; AVX-LABEL: test_unpackhi_hadd_v4f32: ; AVX: ## %bb.0: ; AVX-NEXT: vhaddps %xmm3, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: ret{{[l|q]}} %5 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %1) #4 %6 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %2, <4 x float> %3) #4 @@ -51,7 +51,7 @@ ; AVX-LABEL: test_unpacklo_hsub_v4f32: ; AVX: ## %bb.0: ; AVX-NEXT: vhsubps %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: ret{{[l|q]}} %5 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4 %6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4 @@ -70,7 +70,7 @@ ; AVX-LABEL: test_unpackhi_hsub_v4f32: ; AVX: ## %bb.0: ; AVX-NEXT: vhsubps %xmm3, %xmm1, %xmm0 -; AVX-NEXT: 
vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: ret{{[l|q]}} %5 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %0, <4 x float> %1) #4 %6 = tail call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %2, <4 x float> %3) #4 @@ -164,7 +164,7 @@ ; AVX-LABEL: test_unpacklo_hadd_v4f32_unary: ; AVX: ## %bb.0: ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: ret{{[l|q]}} %2 = tail call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %0, <4 x float> %0) #4 %3 = shufflevector <4 x float> %2, <4 x float> %2, <4 x i32> diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll --- a/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll +++ b/llvm/test/CodeGen/X86/horizontal-shuffle-3.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: test_unpacklo_hadd_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vhaddps %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; CHECK-NEXT: ret{{[l|q]}} %5 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %1) #4 %6 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %2, <8 x float> %3) #4 @@ -18,7 +18,7 @@ ; CHECK-LABEL: test_unpackhi_hadd_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vhaddps %ymm3, %ymm1, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; CHECK-NEXT: ret{{[l|q]}} %5 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %0, <8 x float> %1) #4 %6 = tail call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %2, <8 x float> %3) #4 @@ -30,7 +30,7 @@ ; CHECK-LABEL: test_unpacklo_hsub_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vhsubps %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; CHECK-NEXT: ret{{[l|q]}} %5 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %0, <8 x float> %1) #4 %6 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %2, <8 x float> %3) #4 @@ -42,7 +42,7 @@ ; CHECK-LABEL: test_unpackhi_hsub_v8f32: ; CHECK: ## %bb.0: ; CHECK-NEXT: vhsubps %ymm3, %ymm1, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; CHECK-NEXT: ret{{[l|q]}} %5 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %0, <8 x float> %1) #4 %6 = tail call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %2, <8 x float> %3) #4 diff --git a/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll b/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll --- a/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll +++ b/llvm/test/CodeGen/X86/horizontal-shuffle-4.ll @@ -61,7 +61,7 @@ ; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: vhaddps %ymm3, %ymm2, %ymm1 ; CHECK-NEXT: vhaddps %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1) %2 = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a2, <8 x float> %a3) diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll --- a/llvm/test/CodeGen/X86/horizontal-sum.ll +++ b/llvm/test/CodeGen/X86/horizontal-sum.ll @@ -218,8 +218,8 
@@ ; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX1-SLOW: # %bb.0: ; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 ; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 @@ -262,8 +262,8 @@ ; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,3,1,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1 ; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4 @@ -893,27 +893,27 @@ ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm0, %xmm4 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0] ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm0, %xmm4, %xmm0 ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm4 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] ; AVX-SLOW-NEXT: vaddss %xmm5, %xmm4, %xmm4 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm4, %xmm1 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm2, %xmm1 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX-SLOW-NEXT: vaddss %xmm4, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm1 = xmm3[1,1,3,3] ; AVX-SLOW-NEXT: vaddss %xmm1, %xmm3, %xmm1 ; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3] +; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-SLOW-NEXT: retq @@ -923,24 +923,24 @@ ; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm0[1,0] ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0] ; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm4 = xmm2[1,0] ; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX-FAST-NEXT: 
vaddss %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1 ; AVX-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[3,3,3,3] +; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3] ; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX-FAST-NEXT: retq diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -299,7 +299,7 @@ ; X86-AVX-NEXT: # xmm1 = mem[0,0] ; X86-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; @@ -359,7 +359,7 @@ ; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255] ; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X64-AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; X64-AVX-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -187,7 +187,7 @@ ; ; AVX1-LABEL: arg_f32_v4f32_undef: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: arg_f32_v4f32_undef: @@ -578,7 +578,7 @@ ; ; AVX1-LABEL: arg_f32_v8f32_undef: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1053,7 +1053,7 @@ ; ; AVX1-LABEL: arg_f32_v4f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vmovd %edi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] ; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 @@ -1777,7 +1777,7 @@ ; ; AVX1-LABEL: arg_f32_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vmovd %edi, %xmm2 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -136,14 +136,14 @@ define <4 x float> @knownbits_mask_shuffle_uitofp(<4 x i32> %a0) nounwind { ; X86-LABEL: knownbits_mask_shuffle_uitofp: ; X86: # %bb.0: -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: knownbits_mask_shuffle_uitofp: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq @@ -173,7 +173,7 @@ define <4 x 
float> @knownbits_mask_xor_shuffle_uitofp(<4 x i32> %a0) nounwind { ; X86-LABEL: knownbits_mask_xor_shuffle_uitofp: ; X86: # %bb.0: -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 @@ -181,7 +181,7 @@ ; ; X64-LABEL: knownbits_mask_xor_shuffle_uitofp: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 @@ -384,10 +384,10 @@ define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; X86-LABEL: knownbits_mask_concat_uitofp: ; X86: # %bb.0: -; X86-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,1,3] +; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] ; X86-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071] ; X86-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; X86-NEXT: vandps %xmm2, %xmm0, %xmm0 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 @@ -395,10 +395,10 @@ ; ; X64-LABEL: knownbits_mask_concat_uitofp: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,1,3] +; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] ; X64-NEXT: vmovaps {{.*#+}} xmm2 = [131071,131071,131071,131071] ; X64-NEXT: vandps %xmm2, %xmm1, %xmm1 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; X64-NEXT: vandps %xmm2, %xmm0, %xmm0 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 @@ -601,7 +601,7 @@ ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3 ; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp @@ -613,7 +613,7 @@ ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = and <4 x i32> %a2, @@ -637,7 +637,7 @@ ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3, %xmm3 ; X86-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: movl %ebp, %esp ; X86-NEXT: popl %ebp @@ -649,7 +649,7 @@ ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; X64-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0 -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,2] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,2] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = lshr <4 x i32> %a2, diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -148,7 +148,7 @@ ; X86-LABEL: 
signbits_ashr_extract_sitofp_0: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -157,7 +157,7 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_0: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, @@ -170,7 +170,7 @@ ; X86-LABEL: signbits_ashr_extract_sitofp_1: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) @@ -179,7 +179,7 @@ ; ; X64-LABEL: signbits_ashr_extract_sitofp_1: ; X64: # %bb.0: -; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm0 ; X64-NEXT: retq %1 = ashr <2 x i64> %a0, @@ -322,7 +322,7 @@ define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: signbits_ashr_concat_ashr_extract_sitofp: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = ashr <2 x i64> %a0, @@ -415,7 +415,7 @@ ; X86-NEXT: vpsrad $1, %xmm2, %xmm2 ; X86-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] -; X86-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,3,3] +; X86-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3] ; X86-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 ; X86-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X86-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -443,7 +443,7 @@ ; X64-AVX1-NEXT: vpsrad $1, %xmm2, %xmm2 ; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X64-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] -; X64-AVX1-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,3,3] +; X64-AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,3,3] ; X64-AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm6 ; X64-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 @@ -465,7 +465,7 @@ ; X64-AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; X64-AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 ; X64-AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm3, %ymm0 -; X64-AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; X64-AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; X64-AVX2-NEXT: vcvtdq2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll --- a/llvm/test/CodeGen/X86/masked_store.ll +++ b/llvm/test/CodeGen/X86/masked_store.ll @@ -4958,7 +4958,7 @@ ; X86-AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; X86-AVX512-NEXT: vmovlps %xmm0, 48(%eax) ; X86-AVX512-NEXT: vextractf128 $1, %ymm1, %xmm0 -; X86-AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X86-AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; X86-AVX512-NEXT: vmovlps %xmm0, 88(%eax) ; X86-AVX512-NEXT: vzeroupper ; X86-AVX512-NEXT: retl diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll --- 
a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1649,7 +1649,7 @@ ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -2440,7 +2440,7 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -2457,7 +2457,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -2597,7 +2597,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] ; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4 ; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -2142,7 +2142,7 @@ ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi) ; AVX1-NEXT: retq ; @@ -2158,7 +2158,7 @@ ; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi) ; AVX2-NEXT: retq ; @@ -2279,7 +2279,7 @@ ; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854841343,9223372036854841343] ; AVX-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 ; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -129,7 +129,7 @@ ; ; AVX-LABEL: v5i32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,2,2,3] ; AVX-NEXT: vunpcklps 
{{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi) ; AVX-NEXT: vmovaps %xmm1, (%rdi) @@ -161,7 +161,7 @@ ; AVX-LABEL: v5f32: ; AVX: # %bb.0: ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1],xmm1[1,2] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,3] ; AVX-NEXT: vextractps $3, %xmm0, 16(%rdi) ; AVX-NEXT: vmovaps %xmm1, (%rdi) ; AVX-NEXT: retq @@ -313,9 +313,9 @@ ; AVX-LABEL: v7i32: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,3,2] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2,3,2] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovss %xmm1, 24(%rdi) ; AVX-NEXT: vmovlps %xmm0, 16(%rdi) ; AVX-NEXT: vmovaps %xmm2, (%rdi) @@ -513,12 +513,12 @@ ; AVX1-NEXT: vmovsldup {{.*#+}} ymm2 = ymm2[0,0,2,2,4,4,6,6] ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,u,u,1,5,u,u,6] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5],ymm2[6],ymm3[7] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; AVX1-NEXT: vmovaps %xmm0, 32(%rdi) ; AVX1-NEXT: vmovaps %ymm2, (%rdi) @@ -533,7 +533,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; AVX2-SLOW-NEXT: vmovaps %xmm0, 32(%rdi) ; AVX2-SLOW-NEXT: vmovaps %ymm2, (%rdi) @@ -548,7 +548,7 @@ ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-FAST-ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; AVX2-FAST-ALL-NEXT: vmovaps %xmm0, 32(%rdi) ; AVX2-FAST-ALL-NEXT: vmovaps %ymm2, (%rdi) @@ -563,7 +563,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, 32(%rdi) ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, (%rdi) @@ -574,12 +574,12 @@ ; XOP: # %bb.0: ; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] ; XOP-NEXT: vpermil2ps {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[u,1,5,u],ymm2[6],ymm0[6] -; XOP-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,1,0,1] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,1,0,1] ; 
XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm3 ; XOP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3],xmm3[3,3] -; XOP-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; XOP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] ; XOP-NEXT: vmovaps %xmm0, 32(%rdi) ; XOP-NEXT: vmovaps %ymm2, (%rdi) @@ -1479,14 +1479,14 @@ ; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; AVX1-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] ; AVX1-NEXT: vmovups 16(%rdi), %xmm6 ; AVX1-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; AVX1-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] @@ -1520,7 +1520,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsi) @@ -1580,7 +1580,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, (%rsi) @@ -1600,14 +1600,14 @@ ; XOP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] ; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; XOP-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] ; XOP-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] ; XOP-NEXT: vmovups 16(%rdi), %xmm6 ; XOP-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; XOP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; XOP-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; XOP-NEXT: vshufps {{.*#+}} ymm7 = 
ymm7[0,2,3,1,4,6,7,5] ; XOP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] ; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; XOP-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] @@ -1746,7 +1746,7 @@ ; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2 ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] @@ -1757,7 +1757,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX2-SLOW-NEXT: vbroadcastsd (%rcx), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -1786,7 +1786,7 @@ ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX2-FAST-ALL-NEXT: vbroadcastsd (%rcx), %ymm5 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-ALL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-ALL-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -1803,7 +1803,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups (%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rcx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rsi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] @@ -1814,7 +1814,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -2409,7 +2409,7 @@ ; AVX1-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpblendw {{.*#+}} 
xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -2420,7 +2420,7 @@ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2] ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] @@ -2481,7 +2481,7 @@ ; XOP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5] ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] -; XOP-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; XOP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; XOP-NEXT: vpsrld $16, %xmm0, %xmm0 @@ -2492,7 +2492,7 @@ ; XOP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; XOP-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; XOP-NEXT: vmovshdup {{.*#+}} ymm3 = ymm1[1,1,3,3,5,5,7,7] -; XOP-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] +; XOP-NEXT: vshufps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7] ; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2] ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1] ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3] diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll --- a/llvm/test/CodeGen/X86/packss.ll +++ b/llvm/test/CodeGen/X86/packss.ll @@ -185,7 +185,7 @@ ; X86-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X86-AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; X86-AVX1-NEXT: vzeroupper @@ -236,7 +236,7 @@ ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vpsubq %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; X64-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/palignr.ll b/llvm/test/CodeGen/X86/palignr.ll --- a/llvm/test/CodeGen/X86/palignr.ll +++ b/llvm/test/CodeGen/X86/palignr.ll @@ -11,7 +11,7 @@ ; ; CHECK-AVX-LABEL: test1: ; CHECK-AVX: # %bb.0: -; CHECK-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,0] +; CHECK-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,0] ; CHECK-AVX-NEXT: retl %C = shufflevector <4 x i32> %A, <4 x i32> undef, <4 x i32> < i32 1, i32 2, i32 3, i32 0 > ret <4 x i32> %C diff --git a/llvm/test/CodeGen/X86/pr31956.ll b/llvm/test/CodeGen/X86/pr31956.ll --- a/llvm/test/CodeGen/X86/pr31956.ll +++ b/llvm/test/CodeGen/X86/pr31956.ll @@ -12,7 +12,7 @@ ; CHECK-NEXT: vmovaps G2(%rip), %xmm0 ; CHECK-NEXT: vmovlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3] ; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1] ; CHECK-NEXT: retq entry: %V = load <2 x float>, ptr @G1, align 8 diff --git a/llvm/test/CodeGen/X86/pr40730.ll b/llvm/test/CodeGen/X86/pr40730.ll --- a/llvm/test/CodeGen/X86/pr40730.ll 
+++ b/llvm/test/CodeGen/X86/pr40730.ll @@ -6,7 +6,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ; CHECK-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; CHECK-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] @@ -27,7 +27,7 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,1,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],mem[1,2,3],ymm0[4],mem[5],ymm0[6,7] ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr40811.ll b/llvm/test/CodeGen/X86/pr40811.ll --- a/llvm/test/CodeGen/X86/pr40811.ll +++ b/llvm/test/CodeGen/X86/pr40811.ll @@ -6,8 +6,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovaps (%rdi), %xmm0 ; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2,3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,1,0] -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,2,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,1,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2,1,0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/pr50609.ll b/llvm/test/CodeGen/X86/pr50609.ll --- a/llvm/test/CodeGen/X86/pr50609.ll +++ b/llvm/test/CodeGen/X86/pr50609.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ; CHECK-NEXT: vpsrad $2, %xmm2, %xmm2 ; CHECK-NEXT: vcvtdq2ps %ymm2, %ymm2 -; CHECK-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] +; CHECK-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0] ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; CHECK-NEXT: vmaskmovps %ymm2, %ymm0, (%rdi) ; CHECK-NEXT: vmaskmovps %ymm2, %ymm1, 32(%rdi) diff --git a/llvm/test/CodeGen/X86/prefer-fpext-splat.ll b/llvm/test/CodeGen/X86/prefer-fpext-splat.ll --- a/llvm/test/CodeGen/X86/prefer-fpext-splat.ll +++ b/llvm/test/CodeGen/X86/prefer-fpext-splat.ll @@ -73,7 +73,7 @@ ; AVX1-NEXT: pushq %rax ; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: popq %rax ; AVX1-NEXT: retq ; @@ -120,7 +120,7 @@ ; AVX1-NEXT: pushq %rax ; AVX1-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: callq __extendhfsf2@PLT -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: popq %rax ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/rotate_vec.ll b/llvm/test/CodeGen/X86/rotate_vec.ll --- a/llvm/test/CodeGen/X86/rotate_vec.ll +++ b/llvm/test/CodeGen/X86/rotate_vec.ll @@ -80,7 +80,7 @@ define <4 x i32> @rot_v4i32_zero_non_splat(<4 x i32> %x) { ; XOPAVX1-LABEL: rot_v4i32_zero_non_splat: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: rot_v4i32_zero_non_splat: diff --git a/llvm/test/CodeGen/X86/scalarize-fp.ll b/llvm/test/CodeGen/X86/scalarize-fp.ll --- a/llvm/test/CodeGen/X86/scalarize-fp.ll +++ b/llvm/test/CodeGen/X86/scalarize-fp.ll @@ -394,7 +394,7 @@ ; AVX-LABEL: fmul_splat_splat_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, 
%xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer %splaty = shufflevector <4 x float> %vy, <4 x float> undef, <4 x i32> zeroinitializer @@ -413,7 +413,7 @@ ; AVX-LABEL: fdiv_splat_splat_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> %vx, <8 x float> undef, <8 x i32> zeroinitializer @@ -555,7 +555,7 @@ ; AVX-LABEL: fmul_splat_const_op1_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %splatx = shufflevector <4 x float> %vx, <4 x float> undef, <4 x i32> zeroinitializer %r = fmul fast <4 x float> %splatx, @@ -575,7 +575,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> , <8 x float> undef, <8 x i32> zeroinitializer @@ -597,7 +597,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %splatx = shufflevector <8 x float> %vx, <8 x float> undef, <8 x i32> zeroinitializer @@ -654,7 +654,7 @@ ; AVX-LABEL: splat0_fmul_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %b = fmul fast <4 x float> %vx, %vy %r = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer @@ -672,7 +672,7 @@ ; AVX-LABEL: splat0_fdiv_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %b = fdiv fast <8 x float> %vx, %vy @@ -729,7 +729,7 @@ ; AVX-LABEL: splat0_fmul_const_op1_v4f32: ; AVX: # %bb.0: ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: retq %b = fmul fast <4 x float> %vx, %r = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer @@ -745,7 +745,7 @@ ; ; AVX-LABEL: splat0_fdiv_const_op1_v8f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %b = fdiv fast <8 x float> %vx, @@ -766,7 +766,7 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX-NEXT: retq %b = fdiv fast <8 x float> , %vx @@ -786,7 +786,7 @@ ; AVX-LABEL: multi_use_binop: ; AVX: # %bb.0: ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0 
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] ; AVX-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shuffle-of-shift.ll b/llvm/test/CodeGen/X86/shuffle-of-shift.ll --- a/llvm/test/CodeGen/X86/shuffle-of-shift.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-shift.ll @@ -156,7 +156,7 @@ ; X64-AVX2-NEXT: pushq %rax ; X64-AVX2-NEXT: movl $63, %edi ; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; X64-AVX2-NEXT: popq %rax ; X64-AVX2-NEXT: retq ; @@ -173,7 +173,7 @@ ; X86-AVX2-NEXT: pushl $63 ; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT ; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; X86-AVX2-NEXT: retl %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63) %i2 = bitcast <2 x i64> %i1 to <4 x i32> @@ -336,7 +336,7 @@ ; X64-AVX2-NEXT: pushq %rax ; X64-AVX2-NEXT: movl $63, %edi ; X64-AVX2-NEXT: callq llvm.x86.sse2.psrai.q@PLT -; X64-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; X64-AVX2-NEXT: popq %rax ; X64-AVX2-NEXT: retq ; @@ -353,7 +353,7 @@ ; X86-AVX2-NEXT: pushl $63 ; X86-AVX2-NEXT: calll llvm.x86.sse2.psrai.q@PLT ; X86-AVX2-NEXT: addl $4, %esp -; X86-AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; X86-AVX2-NEXT: retl %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63) %i2 = bitcast <2 x i64> %i1 to <2 x i64> diff --git a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll --- a/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/llvm/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -58,7 +58,7 @@ define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -68,7 +68,7 @@ define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask2: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -78,7 +78,7 @@ define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask3: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -88,8 +88,8 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, ptr %p) nounwind { ; AVX2-LABEL: undef_splatmask4: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,2,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-NEXT: vmovaps %xmm0, (%rdi) ; AVX2-NEXT: vmovaps %xmm1, %xmm0 ; AVX2-NEXT: retq diff --git 
a/llvm/test/CodeGen/X86/sse-fsignum.ll b/llvm/test/CodeGen/X86/sse-fsignum.ll --- a/llvm/test/CodeGen/X86/sse-fsignum.ll +++ b/llvm/test/CodeGen/X86/sse-fsignum.ll @@ -38,10 +38,10 @@ ; AVX-NEXT: vmovapd (%rdi), %xmm0 ; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vcmpltpd %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2,2,3] ; AVX-NEXT: vcvtdq2pd %xmm2, %xmm2 ; AVX-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: vsubpd %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmovapd %xmm0, (%rdi) diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -2104,7 +2104,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] ; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; @@ -2123,7 +2123,7 @@ ; ; X64-AVX1-LABEL: test_mm_set_ps1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; @@ -2265,7 +2265,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] ; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; @@ -2284,7 +2284,7 @@ ; ; X64-AVX1-LABEL: test_mm_set1_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; @@ -2623,7 +2623,7 @@ ; X86-AVX1-LABEL: test_mm_store_ps1: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] @@ -2644,7 +2644,7 @@ ; ; X64-AVX1-LABEL: test_mm_store_ps1: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] @@ -2710,7 +2710,7 @@ ; X86-AVX1-LABEL: test_mm_store1_ps: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; 
X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] @@ -2731,7 +2731,7 @@ ; ; X64-AVX1-LABEL: test_mm_store1_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X64-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] @@ -2972,7 +2972,7 @@ ; X86-AVX1-LABEL: test_mm_storer_ps: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X86-AVX1-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; X86-AVX1-NEXT: # xmm0 = xmm0[3,2,1,0] ; X86-AVX1-NEXT: vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] @@ -2980,7 +2980,7 @@ ; X86-AVX512-LABEL: test_mm_storer_ps: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X86-AVX512-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; X86-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0] ; X86-AVX512-NEXT: vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] @@ -2994,14 +2994,14 @@ ; ; X64-AVX1-LABEL: test_mm_storer_ps: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X64-AVX1-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; X64-AVX1-NEXT: # xmm0 = xmm0[3,2,1,0] ; X64-AVX1-NEXT: vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_storer_ps: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; X64-AVX512-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; X64-AVX512-NEXT: # xmm0 = xmm0[3,2,1,0] ; X64-AVX512-NEXT: vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -4523,7 +4523,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] ; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; @@ -5636,7 +5636,7 @@ ; ; AVX1-LABEL: test_mm_shuffle_epi32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # 
encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -402,13 +402,13 @@ ; ; AVX1-LABEL: test_x86_sse2_pshuf_d: ; AVX1: ## %bb.0: ## %entry -; AVX1-NEXT: vpermilps $27, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; AVX1-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; AVX1-NEXT: ## xmm0 = xmm0[3,2,1,0] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: test_x86_sse2_pshuf_d: ; AVX512: ## %bb.0: ## %entry -; AVX512-NEXT: vpermilps $27, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b] +; AVX512-NEXT: vshufps $27, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0x1b] ; AVX512-NEXT: ## xmm0 = xmm0[3,2,1,0] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] entry: diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -144,7 +144,7 @@ ; X86-AVX-LABEL: test4: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3] ; X86-AVX-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX-NEXT: retl ; @@ -156,7 +156,7 @@ ; ; X64-AVX-LABEL: test4: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3] ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp5 = shufflevector <4 x float> %X, <4 x float> undef, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=1] @@ -448,7 +448,7 @@ ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-AVX-NEXT: vmovaps (%edx), %xmm0 ; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] -; X86-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; X86-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; X86-AVX-NEXT: vmovaps %xmm0, (%eax) ; X86-AVX-NEXT: retl ; @@ -464,7 +464,7 @@ ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: vmovaps (%rdx), %xmm0 ; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],mem[0,1] -; X64-AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; X64-AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; X64-AVX-NEXT: vmovaps %xmm0, (%rdi) ; X64-AVX-NEXT: retq %tmp3 = load <4 x float>, ptr %B ; <<4 x float>> [#uses=1] diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll --- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -338,16 +338,16 @@ ; ; AVX1-LABEL: test13: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX512-LABEL: test13: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: 
vbroadcastss %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -407,8 +407,8 @@ ; AVX1-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX1-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -419,8 +419,8 @@ ; AVX512-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] ; AVX512-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vbroadcastss %xmm2, %xmm1 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -469,8 +469,8 @@ ; AVX-NEXT: vaddss %xmm2, %xmm5, %xmm2 ; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -194,7 +194,7 @@ ; X86-AVX1-LABEL: ext_1: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: pushl %eax ## encoding: [0x50] -; X86-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X86-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X86-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X86-AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] ; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -206,7 +206,7 @@ ; X86-AVX512-LABEL: ext_1: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: pushl %eax ## encoding: [0x50] -; X86-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X86-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X86-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X86-AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 @@ -225,7 +225,7 @@ ; ; X64-AVX1-LABEL: ext_1: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X64-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X64-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X64-AVX1-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] ; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte @@ -233,7 +233,7 @@ ; ; X64-AVX512-LABEL: ext_1: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X64-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X64-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X64-AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte @@ -257,7 +257,7 @@ ; X86-AVX1-LABEL: ext_2: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: pushl %eax ## encoding: [0x50] -; X86-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X86-AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X86-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X86-AVX1-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] ; X86-AVX1-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] @@ -267,7 +267,7 @@ ; X86-AVX512-LABEL: ext_2: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: pushl %eax ## encoding: [0x50] -; X86-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; X86-AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; X86-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X86-AVX512-NEXT: vmovss %xmm0, (%esp) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24] ; X86-AVX512-NEXT: flds (%esp) ## encoding: [0xd9,0x04,0x24] @@ -280,17 +280,11 @@ ; X64-SSE-NEXT: ## xmm0 = xmm0[3,3,3,3] ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; -; X64-AVX1-LABEL: ext_2: -; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] -; X64-AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] -; X64-AVX1-NEXT: retq ## encoding: [0xc3] -; -; X64-AVX512-LABEL: ext_2: -; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] -; X64-AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] -; X64-AVX512-NEXT: retq ## encoding: [0xc3] +; X64-AVX-LABEL: ext_2: +; X64-AVX: ## %bb.0: +; X64-AVX-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] +; X64-AVX-NEXT: ## xmm0 = xmm0[3,3,3,3] +; X64-AVX-NEXT: retq ## encoding: [0xc3] %s = extractelement <4 x float> %v, i32 3 ret float %s } @@ -696,7 +690,7 @@ ; ; AVX1-LABEL: insertps_from_shufflevector_i32_2: ; AVX1: ## %bb.0: ## %entry -; AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -704,7 +698,7 @@ ; ; AVX512-LABEL: insertps_from_shufflevector_i32_2: ; AVX512: ## %bb.0: ## %entry -; AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] ; AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -1096,7 +1090,7 @@ ; ; AVX1-LABEL: i32_shuf_XYY0: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpermilps $212, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] +; AVX1-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xd4] ; AVX1-NEXT: ## 
xmm0 = xmm0[0,1,1,3] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] ; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] @@ -1105,7 +1099,7 @@ ; ; AVX512-LABEL: i32_shuf_XYY0: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $212, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xd4] +; AVX512-NEXT: vshufps $212, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xd4] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,1,3] ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] ; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] @@ -1132,7 +1126,7 @@ ; ; AVX1-LABEL: i32_shuf_XYW0: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpermilps $244, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4] +; AVX1-NEXT: vshufps $244, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xf4] ; AVX1-NEXT: ## xmm0 = xmm0[0,1,3,3] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] ; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] @@ -1141,7 +1135,7 @@ ; ; AVX512-LABEL: i32_shuf_XYW0: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $244, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xf4] +; AVX512-NEXT: vshufps $244, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xf4] ; AVX512-NEXT: ## xmm0 = xmm0[0,1,3,3] ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] ; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] @@ -1169,7 +1163,7 @@ ; ; AVX1-LABEL: i32_shuf_W00W: ; AVX1: ## %bb.0: -; AVX1-NEXT: vpermilps $255, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; AVX1-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; AVX1-NEXT: ## xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] ; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] @@ -1178,7 +1172,7 @@ ; ; AVX512-LABEL: i32_shuf_W00W: ; AVX512: ## %bb.0: -; AVX512-NEXT: vpermilps $255, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0xff] +; AVX512-NEXT: vshufps $255, %xmm0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc0,0xff] ; AVX512-NEXT: ## xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] ; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] @@ -1209,7 +1203,7 @@ ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2] ; AVX1-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01] ; AVX1-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3] -; AVX1-NEXT: vpermilps $0, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x00] +; AVX1-NEXT: vshufps $0, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0x00] ; AVX1-NEXT: ## xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08] ; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] @@ -1245,7 +1239,7 @@ ; AVX1-LABEL: i32_shuf_X00X: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vpermilps $0, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00] +; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 ## 
encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] ; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] @@ -1880,7 +1874,7 @@ ; X86-AVX1-LABEL: insertps_pr20411: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X86-AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; X86-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X86-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X86-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -1890,7 +1884,7 @@ ; X86-AVX512-LABEL: insertps_pr20411: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X86-AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; X86-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X86-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X86-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -1908,7 +1902,7 @@ ; ; X64-AVX1-LABEL: insertps_pr20411: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpermilps $238, %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X64-AVX1-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; X64-AVX1-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X64-AVX1-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X64-AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -1917,7 +1911,7 @@ ; ; X64-AVX512-LABEL: insertps_pr20411: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpermilps $238, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0xee] +; X64-AVX512-NEXT: vshufps $238, %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0xc6,0xc9,0xee] ; X64-AVX512-NEXT: ## xmm1 = xmm1[2,3,2,3] ; X64-AVX512-NEXT: vblendps $2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x02] ; X64-AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1],xmm0[2,3] diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll @@ -2704,8 +2704,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,2,1,0] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> @@ -2719,8 +2719,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", 
"=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll @@ -1317,8 +1317,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,2,1,0] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> @@ -1332,8 +1332,8 @@ ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vpermilps $27, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; CHECK-NEXT: # ymm0 = mem[3,2,1,0,7,6,5,4] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() %2 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/swizzle-avx2.ll b/llvm/test/CodeGen/X86/swizzle-avx2.ll --- a/llvm/test/CodeGen/X86/swizzle-avx2.ll +++ b/llvm/test/CodeGen/X86/swizzle-avx2.ll @@ -25,7 +25,7 @@ define <8 x i32> @swizzle_2(<8 x i32> %v) { ; CHECK-LABEL: swizzle_2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> @@ -35,7 +35,7 @@ define <8 x i32> @swizzle_3(<8 x i32> %v) { ; CHECK-LABEL: swizzle_3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; CHECK-NEXT: retq %1 = shufflevector <8 x i32> %v, <8 x i32> undef, <8 x i32> %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll --- a/llvm/test/CodeGen/X86/vec-libcalls.ll +++ b/llvm/test/CodeGen/X86/vec-libcalls.ll @@ -108,8 +108,8 @@ ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 
16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -140,8 +140,9 @@ ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -191,8 +192,9 @@ ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll @@ -607,7 +607,7 @@ ; ; AVX-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX-64-NEXT: vcvttss2si %xmm1, %rax ; AVX-64-NEXT: vmovq %rax, %xmm1 ; AVX-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -661,7 +661,7 @@ ; ; AVX512F-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64: ; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512F-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-64-NEXT: vcvttss2si %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -715,7 +715,7 @@ ; ; AVX512VL-64-LABEL: strict_vector_fptosi_v4f32_to_v4i64: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -774,7 +774,7 @@ ; AVX-32-NEXT: movzbl %al, %eax ; AVX-32-NEXT: shll $31, %eax ; AVX-32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX-32-NEXT: vmovaps %xmm1, %xmm3 ; AVX-32-NEXT: jae .LBB3_4 @@ -836,7 +836,7 @@ ; ; AVX-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64: ; AVX-64: # %bb.0: -; AVX-64-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-64-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX-64-NEXT: vcomiss %xmm1, %xmm3 ; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2 @@ -908,7 +908,7 @@ ; AVX512F-32-NEXT: andl $-8, %esp ; AVX512F-32-NEXT: subl $40, %esp ; AVX512F-32-NEXT: .cfi_offset %ebx, -12 -; AVX512F-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512F-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; 
AVX512F-32-NEXT: xorl %eax, %eax ; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2 @@ -974,7 +974,7 @@ ; ; AVX512F-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64: ; AVX512F-64: # %bb.0: -; AVX512F-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512F-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-64-NEXT: vcvttss2usi %xmm1, %rax ; AVX512F-64-NEXT: vmovq %rax, %xmm1 ; AVX512F-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1001,7 +1001,7 @@ ; AVX512VL-32-NEXT: andl $-8, %esp ; AVX512VL-32-NEXT: subl $40, %esp ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 @@ -1067,7 +1067,7 @@ ; ; AVX512VL-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64: ; AVX512VL-64: # %bb.0: -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm1, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll @@ -385,7 +385,7 @@ ; AVX512VL-64-LABEL: strict_vector_fptosi_v8f32_to_v8i64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] @@ -399,7 +399,7 @@ ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] ; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2si %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] @@ -442,7 +442,7 @@ ; AVX512VL-32-NEXT: .cfi_offset %edi, -16 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12 ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[3,3,3,3] +; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3] ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3 @@ -477,7 +477,7 @@ ; AVX512VL-32-NEXT: vmovss %xmm1, %xmm1, %xmm3 {%k1} {z} ; AVX512VL-32-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX512VL-32-NEXT: vmovss %xmm2, {{[0-9]+}}(%esp) -; AVX512VL-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-32-NEXT: xorl %eax, %eax ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2 ; AVX512VL-32-NEXT: setae %al @@ -574,7 +574,7 @@ ; AVX512VL-64-LABEL: strict_vector_fptoui_v8f32_to_v8i64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] @@ -588,7 +588,7 @@ ; AVX512VL-64-NEXT: vmovq %rax, %xmm1 ; 
AVX512VL-64-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] ; AVX512VL-64-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX512VL-64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512VL-64-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512VL-64-NEXT: vcvttss2usi %xmm2, %rax ; AVX512VL-64-NEXT: vmovq %rax, %xmm2 ; AVX512VL-64-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -209,7 +209,7 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $24, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstps (%esp) @@ -236,7 +236,7 @@ ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm1 -; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-32-NEXT: vcvtqq2ps %zmm0, %ymm0 ; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512DQ-32-NEXT: vzeroupper @@ -410,7 +410,7 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $24, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, %eax ; AVX-32-NEXT: shrl $31, %eax @@ -471,7 +471,7 @@ ; AVX512DQ-32: # %bb.0: ; AVX512DQ-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm1 -; AVX512DQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX512DQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX512DQ-32-NEXT: vcvtuqq2ps %zmm0, %ymm0 ; AVX512DQ-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512DQ-32-NEXT: vzeroupper @@ -887,21 +887,21 @@ ; ; AVX1-32-LABEL: uitofp_v2i1_v2f64: ; AVX1-32: # %bb.0: -; AVX1-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 ; AVX1-32-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX1-32-NEXT: retl ; ; AVX1-64-LABEL: uitofp_v2i1_v2f64: ; AVX1-64: # %bb.0: -; AVX1-64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX1-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX1-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX1-64-NEXT: retq ; ; AVX512F-LABEL: uitofp_v2i1_v2f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512F-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; AVX512F-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vcvtdq2pd %xmm0, %xmm0 @@ -923,7 +923,7 @@ ; ; AVX512DQ-LABEL: uitofp_v2i1_v2f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512DQ-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,1,1,1] ; AVX512DQ-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vcvtdq2pd %xmm0, %xmm0 @@ -931,14 +931,14 @@ ; ; AVX512DQVL-32-LABEL: uitofp_v2i1_v2f64: ; AVX512DQVL-32: # %bb.0: -; AVX512DQVL-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; 
AVX512DQVL-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512DQVL-32-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512DQVL-32-NEXT: retl ; ; AVX512DQVL-64-LABEL: uitofp_v2i1_v2f64: ; AVX512DQVL-64: # %bb.0: -; AVX512DQVL-64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX512DQVL-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512DQVL-64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX512DQVL-64-NEXT: retq @@ -1218,7 +1218,7 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $32, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) @@ -1406,7 +1406,7 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $32, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, %eax ; AVX-32-NEXT: shrl $31, %eax diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -641,11 +641,11 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $64, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstpl {{[0-9]+}}(%esp) @@ -758,11 +758,11 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $64, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, %eax ; AVX-32-NEXT: shrl $31, %eax @@ -919,11 +919,11 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $48, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; AVX-32-NEXT: fildll {{[0-9]+}}(%esp) ; AVX-32-NEXT: fstps {{[0-9]+}}(%esp) @@ -1042,11 +1042,11 @@ ; AVX-32-NEXT: andl $-8, %esp ; AVX-32-NEXT: subl $48, %esp ; AVX-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm1 = 
xmm0[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-32-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX-32-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) ; AVX-32-NEXT: vextractps $1, %xmm0, %eax ; AVX-32-NEXT: shrl $31, %eax diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll @@ -273,18 +273,18 @@ ; NODQ-32-NEXT: subl $128, %esp ; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm0 ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fstpl {{[0-9]+}}(%esp) @@ -371,18 +371,18 @@ ; NODQ-32-NEXT: subl $128, %esp ; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm3 ; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm2 ; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractps $1, %xmm3, %eax ; NODQ-32-NEXT: shrl $31, %eax @@ -499,19 +499,19 @@ ; NODQ-32-NEXT: andl $-8, %esp ; NODQ-32-NEXT: subl $96, %esp ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = 
xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fildll {{[0-9]+}}(%esp) ; NODQ-32-NEXT: fstps {{[0-9]+}}(%esp) @@ -595,19 +595,19 @@ ; NODQ-32-NEXT: andl $-8, %esp ; NODQ-32-NEXT: subl $96, %esp ; NODQ-32-NEXT: vmovlps %xmm0, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf128 $1, %ymm0, %xmm3 ; NODQ-32-NEXT: vmovlps %xmm3, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; NODQ-32-NEXT: vmovlps %xmm2, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractf32x4 $3, %zmm0, %xmm1 ; NODQ-32-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; NODQ-32-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[2,3,2,3] +; NODQ-32-NEXT: vshufps {{.*#+}} xmm4 = xmm1[2,3,2,3] ; NODQ-32-NEXT: vmovlps %xmm4, {{[0-9]+}}(%esp) ; NODQ-32-NEXT: vextractps $1, %xmm0, %eax ; NODQ-32-NEXT: shrl $31, %eax diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -954,7 +954,7 @@ ; ; AVX1-LABEL: fptosi_4f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX1-NEXT: vcvttss2si %xmm1, %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -972,7 +972,7 @@ ; ; AVX2-LABEL: fptosi_4f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -990,7 +990,7 @@ ; ; AVX512F-LABEL: fptosi_4f32_to_4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2si %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1008,7 +1008,7 @@ ; ; AVX512VL-LABEL: fptosi_4f32_to_4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2si %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1062,7 +1062,7 @@ ; ; AVX1-LABEL: fptosi_8f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX1-NEXT: vcvttss2si %xmm1, %rax ; AVX1-NEXT: vmovq %rax, %xmm1 ; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1080,7 +1080,7 @@ ; ; AVX2-LABEL: fptosi_8f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX2-NEXT: vcvttss2si %xmm1, %rax ; AVX2-NEXT: vmovq %rax, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1103,7 
+1103,7 @@ ; AVX512F-NEXT: vcvttss2si %xmm0, %rcx ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512F-NEXT: vcvttss2si %xmm1, %rdx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2si %xmm0, %rsi ; AVX512F-NEXT: vmovq %rsi, %xmm0 ; AVX512F-NEXT: vmovq %rdx, %xmm1 @@ -1121,7 +1121,7 @@ ; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512VL-NEXT: vcvttss2si %xmm1, %rdx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2si %xmm0, %rsi ; AVX512VL-NEXT: vmovq %rsi, %xmm0 ; AVX512VL-NEXT: vmovq %rdx, %xmm1 @@ -1566,7 +1566,7 @@ ; ; AVX1-LABEL: fptoui_4f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax @@ -1609,7 +1609,7 @@ ; ; AVX2-LABEL: fptoui_4f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax @@ -1652,7 +1652,7 @@ ; ; AVX512F-LABEL: fptoui_4f32_to_4i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2usi %xmm1, %rax ; AVX512F-NEXT: vmovq %rax, %xmm1 ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1670,7 +1670,7 @@ ; ; AVX512VL-LABEL: fptoui_4f32_to_4i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax ; AVX512VL-NEXT: vmovq %rax, %xmm1 ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] @@ -1750,7 +1750,7 @@ ; ; AVX1-LABEL: fptoui_8f32_to_4i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vcvttss2si %xmm3, %rax @@ -1793,7 +1793,7 @@ ; ; AVX2-LABEL: fptoui_8f32_to_4i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 ; AVX2-NEXT: vcvttss2si %xmm3, %rax @@ -1841,7 +1841,7 @@ ; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx ; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi ; AVX512F-NEXT: vmovq %rsi, %xmm0 ; AVX512F-NEXT: vmovq %rdx, %xmm1 @@ -1859,7 +1859,7 @@ ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] ; AVX512VL-NEXT: vcvttss2usi %xmm1, %rdx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvttss2usi %xmm0, %rsi ; AVX512VL-NEXT: vmovq %rsi, %xmm0 ; AVX512VL-NEXT: vmovq %rdx, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ 
b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -933,7 +933,7 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 @@ -3631,7 +3631,7 @@ ; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0 @@ -5431,7 +5431,7 @@ ; ; AVX-LABEL: extract3_sitofp_v4i32_f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 @@ -5457,7 +5457,7 @@ ; ; AVX-LABEL: extract3_sitofp_v4i32_f64: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 @@ -5489,7 +5489,7 @@ ; ; AVX512F-LABEL: extract3_uitofp_v4i32_f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -5497,13 +5497,13 @@ ; ; AVX512VL-LABEL: extract3_uitofp_v4i32_f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -5511,7 +5511,7 @@ ; ; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VLDQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 @@ -5543,7 +5543,7 @@ ; ; AVX512F-LABEL: extract3_uitofp_v4i32_f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512F-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -5551,13 +5551,13 @@ ; ; AVX512VL-LABEL: extract3_uitofp_v4i32_f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VL-NEXT: retq ; ; AVX512DQ-LABEL: 
extract3_uitofp_v4i32_f64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 ; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -5565,7 +5565,7 @@ ; ; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64: ; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512VLDQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 ; AVX512VLDQ-NEXT: retq %e = extractelement <4 x i32> %x, i32 3 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -2748,8 +2748,8 @@ ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777215,16777215,16777215,16777215] ; AVX1-NEXT: vandps %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vandps %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[1,1,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -43,7 +43,7 @@ ; AVX1-NEXT: vpsllq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vpsllq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] -; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,3,2,3] ; AVX1-NEXT: vpaddq %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpsllq %xmm4, %xmm0, %xmm4 ; AVX1-NEXT: vpsllq %xmm2, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -2135,8 +2135,8 @@ ; AVX-NEXT: # xmm0 = mem[1,0] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -2176,8 +2176,8 @@ ; AVX-NEXT: # xmm0 = mem[1,0] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -2218,8 +2218,8 @@ ; AVX-NEXT: # xmm0 = mem[1,0] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; 
AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -2252,7 +2252,7 @@ ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vzeroupper ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2273,8 +2273,9 @@ ; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vzeroupper ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2324,7 +2325,7 @@ ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2345,8 +2346,9 @@ ; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2373,7 +2375,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2394,8 +2396,9 @@ ; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2429,7 +2432,7 @@ ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2450,8 +2453,9 @@ ; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2478,7 +2482,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2499,8 +2503,9 @@ ; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2590,8 +2595,8 @@ ; AVX-NEXT: # xmm0 = mem[1,0] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilps $255, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload @@ -2639,8 +2644,8 @@ ; AVX-NEXT: # xmm0 = mem[1,0] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -2688,8 +2693,8 @@ ; AVX-NEXT: # xmm0 = mem[1,0] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] @@ -2729,7 +2734,7 @@ ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vzeroupper ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2750,8 +2755,9 @@ ; AVX-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX-NEXT: # xmm0 = mem[3,3,3,3] +; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vzeroupper ; AVX-NEXT: callq __truncsfhf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2806,7 +2812,7 @@ ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2827,8 +2833,9 @@ ; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2855,7 +2862,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2876,8 +2883,9 @@ ; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2916,7 +2924,7 @@ ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2937,8 +2945,9 @@ ; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -2965,7 +2974,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -2986,8 +2995,9 @@ ; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4432,7 +4442,7 @@ ; AVX1-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4453,8 +4463,9 @@ ; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4481,7 +4492,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4502,8 +4513,9 @@ ; AVX1-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = 
xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4531,7 +4543,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4552,8 +4564,9 @@ ; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4580,7 +4593,7 @@ ; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4601,8 +4614,9 @@ ; AVX1-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: # xmm0 = mem[3,3,3,3] +; AVX1-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-NEXT: vzeroupper ; AVX1-NEXT: callq __truncsfhf2@PLT ; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4645,7 +4659,7 @@ ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4666,8 +4680,9 @@ ; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; 
AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4694,7 +4709,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill @@ -4715,8 +4730,9 @@ ; AVX2-NEXT: vpunpckldq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4744,7 +4760,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4765,8 +4781,9 @@ ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -4793,7 +4810,7 @@ ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4814,8 +4831,9 @@ ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX2-NEXT: # xmm0 = mem[3,3,3,3] +; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vzeroupper ; AVX2-NEXT: callq __truncsfhf2@PLT ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill ; AVX2-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-interleave.ll b/llvm/test/CodeGen/X86/vector-interleave.ll --- a/llvm/test/CodeGen/X86/vector-interleave.ll +++ b/llvm/test/CodeGen/X86/vector-interleave.ll @@ -538,10 +538,10 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vmovups (%rdi), %xmm0 ; AVX1-NEXT: vmovups 16(%rdi), %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3] ; AVX1-NEXT: vmovups %xmm1, 48(%rsi) ; AVX1-NEXT: vmovups %xmm3, 32(%rsi) ; AVX1-NEXT: vmovups %xmm0, 16(%rsi) @@ -551,8 +551,8 @@ ; AVX2-LABEL: splat2_i32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpermpd $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vmovups %ymm0, 32(%rsi) ; AVX2-NEXT: vmovups %ymm1, (%rsi) ; AVX2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -1170,7 +1170,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm11 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] ; AVX1-ONLY-NEXT: vandps %ymm11, %ymm9, %ymm12 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm13, %ymm11, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm13, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 @@ -2278,7 +2278,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm14, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm14, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -2321,7 +2321,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm14, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm14, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm8, %ymm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 @@ -4705,7 +4705,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -4748,7 +4748,7 @@ ; AVX1-ONLY-NEXT: 
vandps %ymm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -4791,7 +4791,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm6, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 @@ -4834,7 +4834,7 @@ ; AVX1-ONLY-NEXT: vandps %ymm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vandnps %ymm11, %ymm6, %ymm11 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm11, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -2520,7 +2520,7 @@ ; AVX1-ONLY-NEXT: vmovdqa %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] @@ -5302,7 +5302,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload @@ -5326,12 +5326,12 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 
16-byte Reload @@ -5383,14 +5383,14 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm12[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm12[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 @@ -5409,7 +5409,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] @@ -5618,7 +5618,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -5649,7 +5649,7 @@ ; AVX1-ONLY-NEXT: # xmm1 = xmm4[0],mem[0],xmm4[1],mem[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 @@ -5666,7 +5666,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -5690,7 +5690,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] @@ -5705,7 +5705,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -5716,7 +5716,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm9[2],xmm4[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 @@ -5728,7 +5728,7 @@ ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -5763,12 +5763,12 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm8[2],xmm3[3],xmm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3] @@ -5809,8 +5809,8 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm13[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3,3,3] ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-2.ll @@ -28,8 +28,8 @@ ; AVX-LABEL: load_i32_stride2_vf2: ; AVX: # %bb.0: ; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,2,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,3] ; AVX-NEXT: vmovlps %xmm1, (%rsi) ; AVX-NEXT: vmovlps %xmm0, (%rdx) ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -35,10 +35,10 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,0,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovlps %xmm2, (%rsi) ; AVX1-ONLY-NEXT: vmovlps %xmm3, (%rdx) @@ -49,9 +49,9 @@ ; AVX2-ONLY: # %bb.0: ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX2-ONLY-NEXT: vbroadcastss 8(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rsi) @@ -63,9 +63,9 @@ ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] ; AVX512F-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512F-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512F-SLOW-NEXT: vmovlps %xmm2, (%rsi) @@ -91,9 +91,9 @@ ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %xmm0 ; AVX512BW-SLOW-NEXT: vmovaps 16(%rdi), %xmm1 -; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,3,2,3] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] ; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX512BW-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm3 ; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} 
xmm1 = xmm3[0],xmm1[1],xmm3[2,3] ; AVX512BW-SLOW-NEXT: vmovlps %xmm2, (%rsi) @@ -153,14 +153,14 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,3,2,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,0,3,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rdx) ; AVX1-ONLY-NEXT: vmovaps %xmm0, (%rcx) @@ -260,14 +260,14 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] @@ -301,7 +301,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm3, (%rsi) @@ -361,7 +361,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm3, (%rsi) @@ -506,7 +506,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = 
ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm9 @@ -514,21 +514,21 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[2,0],ymm7[3,0],ymm3[6,4],ymm7[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,2],ymm11[0,3],ymm12[5,6],ymm11[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[3,0],ymm0[6,4],ymm9[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm9[0,0],ymm12[2,0],ymm9[4,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps 112(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm6[2,0],ymm11[5,4],ymm6[6,4] @@ -581,12 +581,12 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovaps %ymm6, 32(%rsi) @@ -678,12 +678,12 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 
= ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm6, 32(%rsi) @@ -968,7 +968,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm10[2,0],ymm8[5,4],ymm10[6,4] ; AVX1-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] @@ -979,7 +979,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm14[2,0],ymm6[5,4],ymm14[6,4] ; AVX1-ONLY-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -990,7 +990,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm15 @@ -1002,7 +1002,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1012,7 +1012,7 @@ ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,2],ymm13[0,3],ymm8[5,6],ymm13[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm6[2,0],ymm14[3,0],ymm6[6,4],ymm14[7,4] @@ -1020,7 +1020,7 @@ ; 
AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] @@ -1028,7 +1028,7 @@ ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm11[0,1],ymm15[2],ymm11[3,4],ymm15[5],ymm11[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm12[3,0],ymm2[6,4],ymm12[7,4] @@ -1038,7 +1038,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm7 # 32-byte Folded Reload @@ -1155,22 +1155,22 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm10, %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -1360,22 +1360,22 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm15[2],ymm10[3,4],ymm15[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm10, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm10, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm2[0,1],ymm14[2],ymm2[3,4],ymm14[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm10, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -1930,7 +1930,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] @@ -1939,7 +1939,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm15[2,0],ymm5[5,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] @@ -1949,7 +1949,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm5[2,0],ymm2[5,4],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %ymm2 @@ -1964,7 +1964,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm14[2,0],ymm1[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm8 @@ -1978,7 +1978,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm13[2,0],ymm1[5,4],ymm13[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %ymm9 @@ -1993,7 +1993,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm10 @@ -2008,7 +2008,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm7[2,0],ymm2[5,4],ymm7[6,4] ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm6 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm11 @@ -2023,7 +2023,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -2036,7 +2036,7 @@ ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2049,7 +2049,7 @@ ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2062,7 +2062,7 @@ ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2075,7 +2075,7 @@ ; AVX1-ONLY-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2086,7 +2086,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm8[2],ymm0[3,4],ymm8[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm3[0,3],ymm2[5,6],ymm3[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2096,7 +2096,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm9[2],ymm1[3,4],ymm9[5],ymm1[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,2],ymm4[0,3],ymm5[5,6],ymm4[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2106,7 +2106,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm10[2],ymm2[3,4],ymm10[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm6[0,3],ymm15[5,6],ymm6[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = 
ymm15[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -2117,7 +2117,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm5[0,1],ymm11[2],ymm5[3,4],ymm11[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm7[0,3],ymm14[5,6],ymm7[4,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload @@ -2383,12 +2383,12 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload @@ -2414,7 +2414,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -2428,7 +2428,7 @@ ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm12, %ymm6 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -2865,12 +2865,12 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm12[2],ymm7[3,4],ymm12[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm12 = <2,5,0,3,6,u,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = 
ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm2[2],ymm4[3,4],ymm2[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm12, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload @@ -2896,7 +2896,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm12, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -2910,7 +2910,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1],ymm14[2],mem[3,4],ymm14[5],mem[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -227,7 +227,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm6 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] @@ -434,7 +434,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm9[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],mem[2],xmm11[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm11[7] @@ -447,7 +447,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm11[0,0],ymm0[5,4],ymm11[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0],ymm1[2,2],ymm12[6,4],ymm1[6,6] @@ -455,7 +455,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm12[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[2,0],ymm11[1,0],ymm0[6,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] @@ -863,7 +863,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] @@ -876,7 +876,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm9[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],mem[2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7] @@ -891,7 +891,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[1,0],ymm4[0,0],ymm1[5,4],ymm4[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0],ymm2[2,0],ymm0[7,4],ymm2[6,4] @@ -905,7 +905,7 @@ ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[1,0],ymm3[0,0],ymm0[5,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm6[0,0],ymm5[3,0],ymm6[4,4],ymm5[7,4] @@ -913,7 +913,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2],ymm14[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[2,0],ymm4[1,0],ymm1[6,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,0],ymm2[3,0],ymm11[4,4],ymm2[7,4] @@ -921,7 +921,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = 
xmm8[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[2,0],ymm3[1,0],ymm0[6,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6,7] @@ -1793,7 +1793,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -1809,7 +1809,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -1825,7 +1825,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -1841,7 +1841,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],mem[2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7] @@ -1859,7 +1859,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm1[0,0],ymm5[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm7[2,0],ymm13[7,4],ymm7[6,4] @@ -1874,7 +1874,7 @@ ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[3,0],ymm14[2,0],ymm10[7,4],ymm14[6,4] @@ -1888,7 +1888,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = ymm9[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm13[0,0],ymm9[5,4],ymm13[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1904,7 +1904,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[1,0],ymm10[0,0],ymm4[5,4],ymm10[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1916,7 +1916,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm5[2,0],ymm11[1,0],ymm5[6,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1928,7 +1928,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1940,7 +1940,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm9[2,0],ymm13[1,0],ymm9[6,4],ymm13[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1952,7 +1952,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,0],ymm10[1,0],ymm4[6,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = 
ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload @@ -3734,7 +3734,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 304(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3751,7 +3751,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 624(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3768,7 +3768,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 944(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3785,7 +3785,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 1264(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3801,7 +3801,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 144(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3817,7 +3817,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 464(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3834,7 +3834,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 784(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3851,7 +3851,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 1104(%rdi), %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm4[7] @@ -3872,7 +3872,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,0],ymm10[0,0],ymm1[5,4],ymm10[4,4] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3891,7 +3891,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -3910,7 +3910,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3929,7 +3929,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm7[0,0],ymm1[5,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3947,7 +3947,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm1 = ymm2[1,0],ymm1[0,0],ymm2[5,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3964,7 +3964,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm3[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,0],ymm9[0,0],ymm3[5,4],ymm9[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3980,7 +3980,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,0],ymm12[0,0],ymm6[5,4],ymm12[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3998,7 +3998,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,0],ymm1[0,0],ymm8[5,4],ymm1[4,4] ; AVX1-ONLY-NEXT: vmovaps %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4012,7 +4012,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm10[1,0],ymm1[6,4],ymm10[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4027,7 +4027,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4042,7 +4042,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = 
ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4056,7 +4056,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,0],ymm7[1,0],ymm1[6,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4071,7 +4071,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $18, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm1[2,0],mem[1,0],ymm1[6,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4083,7 +4083,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[2,0],ymm2[1,0],ymm8[6,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4095,7 +4095,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm6[2,0],ymm12[1,0],ymm6[6,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -4107,7 +4107,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0],mem[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,0],ymm9[1,0],ymm3[6,4],ymm9[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll 
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -51,18 +51,18 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; AVX1-ONLY-NEXT: vmovlps %xmm3, (%rsi) ; AVX1-ONLY-NEXT: vmovlps %xmm4, (%rdx) ; AVX1-ONLY-NEXT: vmovlps %xmm5, (%rcx) @@ -78,13 +78,13 @@ ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[2,2,3,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[2,2,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1],xmm1[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm3 = <4,2,u,u> ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] @@ -307,16 +307,16 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[3,3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm6[1] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm1[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm1[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1],xmm6[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,0,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vmovaps 
%xmm4, (%rsi) ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rdx) @@ -558,7 +558,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm4[2,3],ymm3[0,1] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm10[0],ymm4[0],ymm10[3],ymm4[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm0[3,0],ymm5[1,0],ymm0[7,4],ymm5[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm11[2,0],ymm5[2,3],ymm11[6,4],ymm5[6,7] @@ -566,7 +566,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,2],xmm9[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2],ymm5[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[3,1],ymm4[1,3],ymm10[7,5],ymm4[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm8 = ymm4[0,1],ymm3[2],ymm4[3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] @@ -589,7 +589,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,0],ymm4[0,0],ymm3[6,4],ymm4[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,2],ymm8[2,0],ymm4[4,6],ymm8[6,4] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm12 @@ -600,7 +600,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0],ymm4[1,0],ymm3[7,4],ymm4[5,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,3],ymm3[2,0],ymm4[4,7],ymm3[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1],ymm1[1,3],ymm12[7,5],ymm1[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm1[2,0],ymm0[5,5],ymm1[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3,4,5,6,7] @@ -628,7 +628,7 @@ ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[0,1],ymm5[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] @@ -636,42 +636,42 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} 
ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u> ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm11, %ymm4 @@ -681,7 +681,7 @@ ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm5, %ymm3 @@ -713,7 +713,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm8 
= ymm7[0,1],ymm5[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] @@ -721,7 +721,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8 @@ -730,33 +730,33 @@ ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: 
vpermilps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u> ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm11, %ymm4 @@ -766,7 +766,7 @@ ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 @@ -798,7 +798,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm7[0,1],ymm5[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm10 = [8589934596,8589934596,8589934596,8589934596] @@ -806,42 +806,42 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm10 = <1,7,5,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm10, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1],ymm5[2,3],ymm7[4,5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm9 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm9[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm11[1,2,3],ymm10[4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} 
ymm10 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm7[1],ymm10[2,3,4],ymm7[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,2,0,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm11[1,2,3],ymm9[4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <4,2,u,u> ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm11, %ymm4 @@ -851,7 +851,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm10[3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <5,3,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm5, %ymm3 @@ -1253,7 +1253,7 @@ ; AVX1-ONLY-NEXT: vmovapd %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm7[0],ymm14[0],ymm7[3],ymm14[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 288(%rdi), %ymm3, %ymm1 @@ -1270,7 +1270,7 @@ ; AVX1-ONLY-NEXT: vmovapd 320(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm12[2,3],ymm5[0,1] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm11 = ymm5[0],ymm12[0],ymm5[3],ymm12[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = 
ymm13[3,0],ymm6[1,0],ymm13[7,4],ymm6[5,4] @@ -1280,7 +1280,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[0,2],xmm2[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm6[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,1],ymm14[1,3],ymm7[7,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %ymm8, %ymm4 @@ -1290,7 +1290,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[1,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1],ymm12[1,3],ymm5[7,5],ymm12[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -1341,7 +1341,7 @@ ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[0,1],ymm14[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vmovapd 80(%rdi), %xmm7 @@ -1356,7 +1356,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm12[2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0],xmm12[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm2 @@ -1373,7 +1373,7 @@ ; AVX1-ONLY-NEXT: vshufps $215, (%rsp), %ymm7, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm7[3,1],mem[1,3],ymm7[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1],ymm5[2,0],ymm14[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[3,0],ymm12[1,0],ymm9[7,4],ymm12[5,4] @@ -1381,7 +1381,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm11[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,1],ymm3[1,3],ymm2[7,5],ymm3[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,1],ymm2[2,0],ymm13[5,5],ymm2[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -1431,7 +1431,7 @@ ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm6 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = 
ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596] @@ -1443,7 +1443,7 @@ ; AVX2-SLOW-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm14 ; AVX2-SLOW-NEXT: vmovaps 352(%rdi), %ymm15 @@ -1453,73 +1453,73 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm4, %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-SLOW-NEXT: 
vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -1528,7 +1528,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm5, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] @@ -1542,13 +1542,13 @@ ; AVX2-SLOW-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm4, %ymm1 @@ -1557,7 +1557,7 @@ ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] @@ -1607,7 +1607,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596] @@ -1619,7 +1619,7 @@ ; AVX2-FAST-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm14 ; AVX2-FAST-NEXT: vmovaps 352(%rdi), %ymm15 @@ -1629,21 +1629,21 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = 
ymm12[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm4, %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,0,6,4,2,0,6,7] @@ -1651,9 +1651,9 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] @@ -1661,40 +1661,40 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[2,3,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = 
ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3,4],ymm9[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -1703,7 +1703,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm5, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] @@ -1717,13 +1717,13 @@ ; AVX2-FAST-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = 
ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 @@ -1732,7 +1732,7 @@ ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] @@ -1782,7 +1782,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm9[0,1],ymm7[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm0 = [8589934596,8589934596,8589934596,8589934596] @@ -1794,7 +1794,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 256(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm5[0,1],ymm8[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2],ymm15[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps 352(%rdi), %ymm15 @@ -1804,73 +1804,73 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm0, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm4 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm4, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm8[2,3],ymm5[4,5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -1879,7 +1879,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm5 = <4,2,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm5, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3,4,5,6,7] @@ -1893,13 +1893,13 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm11[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm11[2,3],ymm7[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm14[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm12[0,1],ymm6[2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm4 = <5,3,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm4, %ymm1 @@ -1908,7 +1908,7 @@ ; 
AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm2, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3,4,5,6,7] @@ -2782,7 +2782,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm5[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm12[0],ymm5[0],ymm12[3],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 480(%rdi), %ymm3, %ymm5 @@ -2803,7 +2803,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm1 @@ -2829,7 +2829,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm7[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[3],ymm7[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %ymm1 @@ -2854,7 +2854,7 @@ ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm0[2,3],ymm13[0,1] ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm13 = ymm15[0],ymm0[0],ymm15[3],ymm0[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -2865,7 +2865,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[3,1],ymm11[1,3],ymm12[7,5],ymm11[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -2878,7 +2878,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: 
vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm0[3,1],ymm8[1,3],ymm0[7,5],ymm8[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -2890,7 +2890,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm0[3,1],mem[1,3],ymm0[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -2901,7 +2901,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,1],ymm7[1,3],ymm15[7,5],ymm7[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -3017,7 +3017,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -3036,7 +3036,7 @@ ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2,3] @@ -3055,7 +3055,7 @@ ; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovapd 272(%rdi), %xmm12 @@ -3071,7 +3071,7 @@ ; AVX1-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[0,1],ymm2[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), 
%xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0],xmm5[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vmovapd 656(%rdi), %xmm5 @@ -3094,7 +3094,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm15[1,1],ymm8[2,0],ymm15[5,5],ymm8[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm3[1,0],ymm6[7,4],ymm3[5,4] @@ -3107,7 +3107,7 @@ ; AVX1-ONLY-NEXT: # ymm6 = ymm6[3,1],mem[1,3],ymm6[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1],ymm6[2,0],ymm8[5,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[3,0],ymm0[1,0],ymm9[7,4],ymm0[5,4] @@ -3118,7 +3118,7 @@ ; AVX1-ONLY-NEXT: # ymm6 = ymm12[3,1],mem[1,3],ymm12[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[1,1],ymm6[2,0],ymm8[5,5],ymm6[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm14[1,0],ymm4[7,4],ymm14[5,4] @@ -3127,7 +3127,7 @@ ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,1],mem[1,3],ymm5[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[1,1],ymm5[2,0],ymm7[5,5],ymm5[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -3205,7 +3205,7 @@ ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596] @@ -3219,7 +3219,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3241,7 +3241,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill @@ -3263,7 +3263,7 @@ ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] ; AVX2-SLOW-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3275,59 +3275,59 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm11, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = 
ymm13[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] @@ -3341,13 +3341,13 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] @@ -3360,69 +3360,69 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vmovaps %ymm8, %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 
32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3430,10 +3430,10 @@ ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] @@ -3442,7 +3442,7 @@ ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = 
ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3454,7 +3454,7 @@ ; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm9 @@ -3474,7 +3474,7 @@ ; AVX2-SLOW-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps %ymm9, %ymm1 ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] @@ -3488,7 +3488,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm1, %ymm11 ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] @@ -3505,7 +3505,7 @@ ; AVX2-SLOW-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm3, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3513,7 +3513,7 @@ ; AVX2-SLOW-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> @@ -3523,19 +3523,19 @@ ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm3, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -3615,7 +3615,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596] @@ -3629,7 +3629,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3651,7 +3651,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3673,7 +3673,7 @@ ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] ; AVX2-FAST-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3685,34 +3685,34 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovups %ymm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm11, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload @@ -3722,17 +3722,17 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $51, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload @@ -3750,7 +3750,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), 
%xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] @@ -3772,64 +3772,64 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm7, %ymm4 ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm13 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm13[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3837,10 +3837,10 @@ ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] @@ -3849,7 +3849,7 @@ ; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: 
vpermilps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -3862,7 +3862,7 @@ ; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3,4,5,6,7] @@ -3880,7 +3880,7 @@ ; AVX2-FAST-NEXT: # ymm13 = mem[0,1,2,3],ymm13[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -3893,7 +3893,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm11 ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] @@ -3910,7 +3910,7 @@ ; AVX2-FAST-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -3918,7 +3918,7 @@ ; AVX2-FAST-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> @@ -3928,19 +3928,19 @@ ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm10, 
%ymm3, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm15, %ymm3, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -4020,7 +4020,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm3[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [8589934596,8589934596,8589934596,8589934596] @@ -4034,7 +4034,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm14, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4056,7 +4056,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm14, %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill @@ -4078,7 +4078,7 @@ ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm15[0,1],ymm1[0,1] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4090,59 +4090,59 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 
# 32-byte Folded Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm11[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm11 = [12884901893,12884901893,12884901893,12884901893] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm12[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm11, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm12[2,3],ymm11[4,5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm8[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1,2,3],ymm0[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3,4,5,6,7] @@ -4156,13 +4156,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $204, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm15[0,1],mem[2,3],ymm15[4,5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3,4,5,6,7] @@ -4175,69 +4175,69 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte 
Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm15[2,3],ymm14[4,5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1,2,3],ymm2[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm8, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm8[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -4245,10 +4245,10 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] @@ -4257,7 +4257,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -4269,7 +4269,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm1[2,3],ymm14[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm1 = <4,2,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm9 @@ -4289,7 +4289,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm6[2,3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm9, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4,5,6,7] @@ -4303,7 +4303,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0,1],ymm4[2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2,3,4,5,6,7] @@ -4320,7 +4320,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm0[2,3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm11[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -4328,7 +4328,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[0,1],ymm3[2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0,1,2],mem[3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <5,3,u,u> @@ -4338,19 +4338,19 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = 
ymm13[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm3, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -6048,7 +6048,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3],ymm4[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[3],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, 672(%rdi), %ymm3, %ymm1 @@ -6070,7 +6070,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %ymm1 @@ -6097,7 +6097,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm1 @@ -6124,7 +6124,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm1 @@ -6151,7 +6151,7 @@ ; 
AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm1 @@ -6178,7 +6178,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm1 @@ -6205,7 +6205,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[3],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1248(%rdi), %ymm1 @@ -6233,7 +6233,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm15[0,1] ; AVX1-ONLY-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufpd {{.*#+}} ymm15 = ymm2[0],ymm0[0],ymm2[3],ymm0[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6248,7 +6248,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6262,7 +6262,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,1],ymm15[1,3],ymm1[7,5],ymm15[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6275,7 +6275,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; 
AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm1[3,1],ymm13[1,3],ymm1[7,5],ymm13[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6288,7 +6288,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm1[3,1],ymm14[1,3],ymm1[7,5],ymm14[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6301,7 +6301,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6314,7 +6314,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload @@ -6327,7 +6327,7 @@ ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $215, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm1[3,1],mem[1,3],ymm1[7,5],mem[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -6341,7 +6341,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1],ymm2[1,3],ymm1[7,5],ymm2[5,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6591,7 
+6591,7 @@ ; AVX1-ONLY-NEXT: # ymm2 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6611,7 +6611,7 @@ ; AVX1-ONLY-NEXT: # ymm6 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 208(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6635,7 +6635,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 400(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6654,7 +6654,7 @@ ; AVX1-ONLY-NEXT: # ymm12 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 592(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6674,7 +6674,7 @@ ; AVX1-ONLY-NEXT: # ymm11 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 784(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6693,7 +6693,7 @@ ; AVX1-ONLY-NEXT: # ymm10 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6714,7 +6714,7 @@ ; AVX1-ONLY-NEXT: # ymm9 = mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 1168(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] @@ -6734,7 +6734,7 @@ ; AVX1-ONLY-NEXT: # ymm7 
= mem[0,1],ymm0[2,3],mem[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-ONLY-NEXT: vmovaps 1360(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0],xmm0[1],xmm13[2,3] @@ -6761,7 +6761,7 @@ ; AVX1-ONLY-NEXT: # ymm15 = ymm15[3,1],mem[1,3],ymm15[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,1],ymm15[2,0],ymm14[5,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm8 # 32-byte Reload @@ -6775,7 +6775,7 @@ ; AVX1-ONLY-NEXT: # ymm15 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,1],ymm15[2,0],ymm8[5,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -6789,7 +6789,7 @@ ; AVX1-ONLY-NEXT: # ymm15 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm8[1,1],ymm15[2,0],ymm8[5,5],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2,3,4],ymm5[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0],ymm4[1,0],ymm12[7,4],ymm4[5,4] @@ -6802,7 +6802,7 @@ ; AVX1-ONLY-NEXT: # ymm13 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm8[1,1],ymm13[2,0],ymm8[5,5],ymm13[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[3,0],ymm3[1,0],ymm11[7,4],ymm3[5,4] @@ -6815,7 +6815,7 @@ ; AVX1-ONLY-NEXT: # ymm12 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[1,1],ymm12[2,0],ymm8[5,5],ymm12[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[3,0],ymm2[1,0],ymm10[7,4],ymm2[5,4] @@ -6828,7 +6828,7 @@ ; AVX1-ONLY-NEXT: # ymm11 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm8[1,1],ymm11[2,0],ymm8[5,5],ymm11[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm1[1,0],ymm9[7,4],ymm1[5,4] @@ -6841,7 +6841,7 @@ ; AVX1-ONLY-NEXT: # ymm10 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm8[1,1],ymm10[2,0],ymm8[5,5],ymm10[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0],ymm0[1,0],ymm7[7,4],ymm0[5,4] @@ -6854,7 +6854,7 @@ ; AVX1-ONLY-NEXT: # ymm9 = ymm8[3,1],mem[1,3],ymm8[7,5],mem[5,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm9[2,0],ymm8[5,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -6976,7 +6976,7 @@ ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6993,7 +6993,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7018,7 +7018,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill @@ -7043,7 +7043,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = 
ymm2[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7066,7 +7066,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7088,7 +7088,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7110,7 +7110,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7132,7 +7132,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7173,43 +7173,43 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm12, %ymm9, %ymm12 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm9, %ymm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps 
%ymm3, %ymm9, %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -7218,14 +7218,14 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -7239,19 +7239,19 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 960(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: 
vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -7260,34 +7260,34 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1344(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1152(%rdi), %xmm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; 
AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -7301,14 +7301,14 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -7322,14 +7322,14 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] @@ -7342,26 +7342,26 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] @@ -7372,7 +7372,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] @@ -7384,7 +7384,7 @@ ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4],ymm13[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] @@ -7399,7 +7399,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm4 = mem[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[3,3,3,3,7,7,7,7] @@ -7408,7 +7408,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vpermilps $244, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] @@ -7420,24 +7420,24 @@ ; AVX2-SLOW-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded 
Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] @@ -7451,19 +7451,19 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -7471,19 +7471,19 @@ ; AVX2-SLOW-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -7519,7 +7519,7 @@ ; AVX2-SLOW-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm8, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -7539,7 +7539,7 @@ ; AVX2-SLOW-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] @@ -7558,7 +7558,7 @@ ; AVX2-SLOW-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -7577,7 +7577,7 @@ ; AVX2-SLOW-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = 
ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload @@ -7597,7 +7597,7 @@ ; AVX2-SLOW-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload @@ -7613,7 +7613,7 @@ ; AVX2-SLOW-NEXT: vmovaps 1232(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -7630,7 +7630,7 @@ ; AVX2-SLOW-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1424(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -7647,7 +7647,7 @@ ; AVX2-SLOW-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -7703,20 +7703,20 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm14, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm0, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpermps 
%ymm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] @@ -7845,7 +7845,7 @@ ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7862,7 +7862,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7887,7 +7887,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7912,7 +7912,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7935,7 +7935,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7957,7 +7957,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7979,7 +7979,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8001,7 +8001,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8042,31 +8042,31 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm10, %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm10, %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm10, %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = 
ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -8086,7 +8086,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 576(%rdi), %xmm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] @@ -8105,7 +8105,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 960(%rdi), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] @@ -8116,7 +8116,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] @@ -8125,7 +8125,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1344(%rdi), %xmm3 ; AVX2-FAST-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm5 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] @@ -8136,16 +8136,16 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1152(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = 
ymm5[0],ymm7[1,2,3],ymm5[4],ymm7[5,6,7] @@ -8156,7 +8156,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3],ymm8[4,5,6],ymm7[7] @@ -8165,7 +8165,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1,2,3],ymm5[4],ymm8[5,6,7] @@ -8185,7 +8185,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 384(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm8 = xmm4[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,3,2,3] ; AVX2-FAST-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] @@ -8208,21 +8208,21 @@ ; AVX2-FAST-NEXT: # ymm8 = mem[0,1],ymm4[2,3],mem[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm0[2,3,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm0[2,3,2,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm4[0,0,2,3,4,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm4[0,0,2,3,4,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] @@ -8241,7 +8241,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,0,3] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2,3],ymm1[4],ymm6[5,6,7] @@ -8260,7 +8260,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] @@ -8271,7 +8271,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3],ymm6[4,5,6],ymm2[7] @@ -8283,13 +8283,13 @@ ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[2,3,4],ymm11[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] @@ -8297,7 +8297,7 @@ ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = mem[3,3,3,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[3,3,3,3,7,7,7,7] @@ -8306,26 +8306,26 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps 
{{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm12 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm14[1],ymm1[2,3,4],ymm14[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -8333,19 +8333,19 @@ ; AVX2-FAST-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 
@@ -8381,7 +8381,7 @@ ; AVX2-FAST-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -8402,7 +8402,7 @@ ; AVX2-FAST-NEXT: vmovaps 464(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] @@ -8421,7 +8421,7 @@ ; AVX2-FAST-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8439,7 +8439,7 @@ ; AVX2-FAST-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8460,7 +8460,7 @@ ; AVX2-FAST-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8477,7 +8477,7 @@ ; AVX2-FAST-NEXT: vmovaps 1232(%rdi), %xmm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8495,7 +8495,7 @@ ; AVX2-FAST-NEXT: # ymm12 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1424(%rdi), %xmm13 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = 
ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm8, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -8511,7 +8511,7 @@ ; AVX2-FAST-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -8566,20 +8566,20 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm13[3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] @@ -8707,7 +8707,7 @@ ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm9[0,1],ymm8[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8724,7 +8724,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 704(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8749,7 +8749,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1088(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill @@ -8774,7 +8774,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1472(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8797,7 +8797,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8819,7 +8819,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm10[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 512(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8841,7 +8841,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 896(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8863,7 +8863,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,2,2,2,4,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: 
vmovaps 1280(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8904,43 +8904,43 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm12, %ymm9, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm9, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm10[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm9, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm9, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm6 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = 
ymm5[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -8949,14 +8949,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 576(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm4[2,3],ymm13[4,5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -8970,19 +8970,19 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 960(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, (%rsp), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] @@ -8991,34 +8991,34 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1344(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm14[2,3],ymm11[4,5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1152(%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -9032,14 +9032,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = 
mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -9053,14 +9053,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 384(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1,2,3],ymm0[4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7] @@ -9073,26 +9073,26 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $224, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1],ymm1[2,3],mem[4,5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,2,3,4,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm8[1,2,3],ymm6[4],ymm8[5,6,7] @@ -9103,7 +9103,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm8[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm5[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3],ymm9[4,5,6],ymm8[7] @@ -9115,7 +9115,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm13[1],ymm6[2,3,4],ymm13[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,3,2,3] @@ -9130,7 +9130,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm4 = mem[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm12[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[3,3,3,3,7,7,7,7] @@ -9139,7 +9139,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm5[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, (%rsp), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3],ymm6[4,5,6],ymm5[7] @@ -9151,24 +9151,24 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps $244, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm14[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm11[1],ymm4[2,3,4],ymm11[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,3,2,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm15[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm13[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm12[1],ymm3[2,3,4],ymm12[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,0,3] @@ -9182,19 +9182,19 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4],ymm2[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm1[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] @@ -9202,19 +9202,19 @@ ; AVX2-FAST-PERLANE-NEXT: vpermilps 
$255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = mem[3,3,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,0,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm6[0,1,3,3,4,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] @@ -9250,7 +9250,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 272(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <4,2,u,u> ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm8, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2,3,4,5,6,7] @@ -9270,7 +9270,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 464(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1],ymm6[2,3],ymm7[4,5,6,7] @@ -9289,7 +9289,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 656(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9308,7 
+9308,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 848(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm2 # 32-byte Folded Reload @@ -9328,7 +9328,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 1040(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload @@ -9344,7 +9344,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 1232(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0,1],ymm1[2,3],ymm11[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9361,7 +9361,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1424(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9378,7 +9378,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm1[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm2[2,3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,0,2,4,5,4,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -9434,20 +9434,20 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm0, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm7[0,1,2,3,4],ymm10[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm0, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -58,21 +58,21 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0],xmm5[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0],xmm2[1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,0,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,2,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0],ymm0[1,0],ymm1[4,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[2,0],ymm1[5,4],ymm0[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovlps %xmm5, (%rsi) ; AVX1-ONLY-NEXT: vmovlps %xmm6, (%rdx) @@ -95,18 +95,18 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0],xmm3[1],xmm4[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[1,0,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[1,0,2,3] ; AVX2-ONLY-NEXT: vbroadcastss 8(%rdi), %xmm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = 
xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX2-ONLY-NEXT: vmovaps {{.*#+}} xmm4 = <4,3,u,u> ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,2,3,5,4,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,2,3,5,4,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rsi) @@ -346,37 +346,37 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm7[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0],xmm3[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[1,0],mem[3,3] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm8 = xmm8[0,1,2],xmm6[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0],xmm5[1],xmm9[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,1,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,1,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm10[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[0,1],xmm5[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps 
{{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsi) @@ -401,9 +401,9 @@ ; AVX2-SLOW-NEXT: vbroadcastss 84(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX2-SLOW-NEXT: vbroadcastss 8(%rdi), %xmm6 @@ -415,9 +415,9 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] @@ -426,13 +426,13 @@ ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] @@ -461,7 +461,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovaps 80(%rdi), %xmm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vbroadcastss 8(%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm7 @@ -472,9 +472,9 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,1,0] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] @@ -483,13 +483,13 @@ ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX2-FAST-NEXT: vbroadcastss 80(%rdi), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] @@ -515,9 +515,9 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 84(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rdi), %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 8(%rdi), %xmm6 @@ -529,9 +529,9 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm6[0],xmm8[1],xmm6[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,1,0] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[3,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] @@ -540,13 +540,13 @@ ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps 
{{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3] @@ -761,7 +761,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm13[0,1],xmm5[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2],ymm3[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 128(%rdi), %xmm6 @@ -781,7 +781,7 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm14[1,2],xmm10[2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm9[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm2[3,1],ymm11[0,3],ymm2[7,5],ymm11[4,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[2,1],ymm15[2,0],ymm12[6,5],ymm15[6,4] @@ -796,7 +796,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[1,0],ymm12[0,0],ymm11[5,4],ymm12[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[3,1],ymm11[0,2],ymm12[7,5],ymm11[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm9[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm14[0,1],ymm4[1,3],ymm14[4,5],ymm4[5,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm8[0,2],ymm12[2,0],ymm8[4,6],ymm12[6,4] @@ -806,9 +806,9 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0],ymm13[0,0],ymm8[7,4],ymm13[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0],ymm12[2,0],ymm8[6,4],ymm12[6,4] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = mem[0],xmm9[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] @@ -818,20 +818,20 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm9[1,0],ymm4[2,0],ymm9[5,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm12[0,1,2],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm0[0,0],ymm1[1,0],ymm0[4,4],ymm1[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm13[0,1],xmm9[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm14[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,0],ymm9[0,0],ymm14[7,4],ymm9[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1],ymm9[2,0],ymm5[4,5],ymm9[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[2,0],ymm0[5,4],ymm1[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -1576,7 +1576,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1595,7 +1595,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1639,7 +1639,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm5[0,3],ymm14[7,5],ymm5[4,7] @@ -1651,7 +1651,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm12[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,1],ymm4[0,3],ymm12[7,5],ymm4[4,7] @@ -1666,7 +1666,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm7[0,0],ymm5[5,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm7[3,1],ymm2[0,2],ymm7[7,5],ymm2[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1,2],xmm15[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm8[1,3],ymm7[4,5],ymm8[5,7] @@ -1676,7 +1676,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm4[1,0],ymm13[0,0],ymm4[5,4],ymm13[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = 
ymm13[3,1],ymm2[0,2],ymm13[7,5],ymm2[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm9[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1],ymm3[1,3],ymm5[4,5],ymm3[5,7] @@ -1689,9 +1689,9 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[2,0],ymm1[6,4],ymm2[6,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = mem[0],xmm6[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7] @@ -1701,10 +1701,10 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm3[2,0],ymm5[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm14 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm10[0,1,2],xmm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = mem[0],xmm11[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] @@ -1719,7 +1719,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[0,0],ymm12[1,0],ymm13[4,4],ymm12[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[0,1],xmm2[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm6[4,5,6,7] @@ -1733,13 +1733,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm10[0,0],ymm11[1,0],ymm10[4,4],ymm11[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm8[0,1],xmm6[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm7[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[3,0],ymm6[0,0],ymm7[7,4],ymm6[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm1[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,0],ymm7[4,5],ymm6[6,4] @@ -1747,13 +1747,13 @@ ; AVX1-ONLY-NEXT: # xmm7 = 
mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm13[1,0],ymm12[2,0],ymm13[5,4],ymm12[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm5[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0],ymm7[0,0],ymm5[7,4],ymm7[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,0],ymm7[4,5],ymm5[6,4] @@ -1761,7 +1761,7 @@ ; AVX1-ONLY-NEXT: # xmm7 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] @@ -3460,7 +3460,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3481,7 +3481,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3504,7 +3504,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 304(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3532,7 +3532,7 @@ ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 752(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3636,7 +3636,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups 
%ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload @@ -3652,7 +3652,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload @@ -3669,7 +3669,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4],ymm15[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[3,1],ymm6[0,3],ymm15[7,5],ymm6[4,7] @@ -3683,7 +3683,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1,2,3,4],ymm6[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm0[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -3705,7 +3705,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm10[3,1],ymm6[0,2],ymm10[7,5],ymm6[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm10 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm10 = xmm5[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3719,7 +3719,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm8[3,1],ymm6[0,2],ymm8[7,5],ymm6[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm9 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -3730,7 +3730,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[0,0],ymm3[5,4],ymm7[4,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,1],ymm8[0,2],ymm7[7,5],ymm8[4,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm0[0,1,2],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[3,2,2,3] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1],ymm15[1,3],ymm9[4,5],ymm15[5,7] @@ -3745,7 +3745,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm15 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload @@ -3762,7 +3762,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm7 = mem[2,3,2,3] @@ -3777,7 +3777,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[2,0],ymm3[2,0],ymm2[6,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm12[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload @@ -3794,7 +3794,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm1[2,0],ymm3[2,0],ymm1[6,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload @@ -3809,7 +3809,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm11[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] @@ -3831,7 +3831,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm13[0,1],xmm10[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7] @@ -3848,7 +3848,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 
; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm3[0,0],ymm4[1,0],ymm3[4,4],ymm4[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm13[0,1],xmm12[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm10[4,5,6,7] @@ -3864,7 +3864,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0],ymm14[1,0],ymm10[4,4],ymm14[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm13[0,1],xmm1[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] @@ -3880,13 +3880,13 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = ymm11[0,0],mem[1,0],ymm11[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[0,1],xmm13[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[3,0],ymm1[0,0],ymm6[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm2[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] @@ -3894,7 +3894,7 @@ ; AVX1-ONLY-NEXT: # xmm6 = mem[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm3[1,0],ymm4[2,0],ymm3[5,4],ymm4[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -3914,14 +3914,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,0],mem[2,0],ymm0[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0],ymm1[0,0],ymm0[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,3,2,3] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm1[2,0],ymm6[4,5],ymm1[6,4] @@ -3930,7 +3930,7 @@ ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm6[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[1,0],ymm14[2,0],ymm10[5,4],ymm14[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] @@ -3948,7 +3948,7 @@ ; AVX1-ONLY-NEXT: # xmm7 = xmm8[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm11[1,0],mem[2,0],ymm11[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -4285,8 +4285,8 @@ ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 @@ -4358,13 +4358,13 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 @@ -4372,41 +4372,41 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = 
ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] @@ -4796,8 +4796,8 @@ ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload @@ -5308,8 +5308,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm14 @@ -5381,13 +5381,13 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 @@ -5395,41 +5395,41 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0],ymm13[1],ymm12[2,3,4],ymm13[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm15[0],mem[1],ymm15[2,3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm6 ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm8[7] @@ -7429,7 +7429,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rdi), %xmm4 ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7452,7 +7452,7 @@ ; AVX1-ONLY-NEXT: vmovaps 672(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 832(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7474,7 +7474,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovaps 1200(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7501,7 +7501,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 1648(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7528,7 +7528,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovaps 80(%rdi), %xmm1 ; 
AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7555,7 +7555,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovaps 528(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7583,7 +7583,7 @@ ; AVX1-ONLY-NEXT: vmovaps 896(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %ymm6 ; AVX1-ONLY-NEXT: vmovaps 976(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7610,7 +7610,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1344(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 1424(%rdi), %xmm1 @@ -7805,7 +7805,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm15[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7822,7 +7822,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm11[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $199, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload @@ -7839,7 +7839,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm11[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -7857,7 +7857,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[2,3,2,3] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm9[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -7875,7 +7875,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm14[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm4[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[3,1],ymm6[0,3],ymm14[7,5],ymm6[4,7] @@ -7951,7 +7951,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 416(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7968,7 +7968,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7982,7 +7982,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm8[0,2],ymm0[7,5],ymm8[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm11[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7996,7 +7996,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1],ymm8[0,2],ymm0[7,5],ymm8[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1760(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8012,7 +8012,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,2],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1536(%rdi), %ymm0 ; AVX1-ONLY-NEXT: 
vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8026,7 +8026,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,1],ymm2[0,2],ymm3[7,5],ymm2[4,6] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm1[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 1088(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovaps %ymm6, %ymm14 @@ -8044,7 +8044,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm9[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 640(%rdi), %ymm0 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -8062,7 +8062,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm4[0,1,2],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %ymm13 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -8079,9 +8079,9 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = mem[0],xmm7[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] @@ -8095,7 +8095,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm4[2,0],ymm0[6,4],ymm4[6,4] ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3] @@ -8110,13 +8110,13 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,0],ymm3[2,0],ymm12[5,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 512(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] -; AVX1-ONLY-NEXT: vpermilps 
{{.*#+}} xmm4 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm4[1],mem[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -8130,7 +8130,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 768(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 736(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] @@ -8146,7 +8146,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 960(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] @@ -8165,7 +8165,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1216(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1184(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3] @@ -8183,7 +8183,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[2,0],ymm0[6,4],ymm1[6,4] ; AVX1-ONLY-NEXT: vmovaps 1440(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1408(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm1[0,1,2],xmm10[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -8200,7 +8200,7 @@ ; AVX1-ONLY-NEXT: # ymm3 = ymm2[1,0],mem[2,0],ymm2[5,4],mem[6,4] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm0[2,0],ymm3[2,0],ymm0[6,4],ymm3[6,4] ; AVX1-ONLY-NEXT: vmovaps 1664(%rdi), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3] ; AVX1-ONLY-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload @@ -8222,7 +8222,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm14[0,0],ymm15[1,0],ymm14[4,4],ymm15[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm11[0,1],xmm10[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7] @@ -8239,7 +8239,7 @@ ; AVX1-ONLY-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = ymm10[0,0],mem[1,0],ymm10[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm10[0,1],xmm8[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3],ymm3[4,5,6,7] @@ -8257,7 +8257,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[0,0],mem[1,0],ymm8[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[0,1],xmm7[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] @@ -8275,7 +8275,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[0,0],mem[1,0],ymm7[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,1],xmm6[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7] @@ -8294,7 +8294,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm6[0,0],mem[1,0],ymm6[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,1],xmm5[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] @@ -8312,7 +8312,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[0,0],mem[1,0],ymm5[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1],xmm4[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -8330,7 +8330,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm4[0,0],mem[1,0],ymm4[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,1],xmm1[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -8346,14 +8346,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 
32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,0],mem[1,0],ymm3[4,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[0,1],xmm0[3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0],ymm0[0,0],ymm13[7,4],ymm0[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -8363,14 +8363,14 @@ ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0,1,2],mem[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,0],ymm15[2,0],ymm14[5,4],ymm15[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm2[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -8382,7 +8382,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[1,0],mem[2,0],ymm3[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -8401,14 +8401,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm4[1,0],mem[2,0],ymm4[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[0,0],ymm4[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -8420,7 +8420,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[1,0],mem[2,0],ymm5[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -8440,14 +8440,14 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm6[1,0],mem[2,0],ymm6[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3,0,1] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm6[3,0],ymm3[0,0],ymm6[7,4],ymm3[4,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 @@ -8459,7 +8459,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = ymm7[1,0],mem[2,0],ymm7[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm3[4,5,6,7] @@ -8479,7 +8479,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[1,0],mem[2,0],ymm8[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7] @@ -8498,7 +8498,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $33, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = ymm9[1,0],mem[2,0],ymm9[5,4],mem[6,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] 
@@ -9092,8 +9092,8 @@ ; AVX2-SLOW-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm4[1,3],ymm10[4,6],ymm4[5,7] ; AVX2-SLOW-NEXT: vbroadcastss 432(%rdi), %ymm14 @@ -9105,8 +9105,8 @@ ; AVX2-SLOW-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm6 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] @@ -9119,8 +9119,8 @@ ; AVX2-SLOW-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -9134,8 +9134,8 @@ ; AVX2-SLOW-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -9175,8 +9175,8 @@ ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm7 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm11[1,3],ymm12[4,6],ymm11[5,7] @@ -9189,8 +9189,8 @@ ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -9321,13 +9321,13 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 216(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm8, %xmm8 @@ -9336,14 +9336,14 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 440(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] @@ -9351,7 +9351,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 544(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -9359,7 +9359,7 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = 
ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 664(%rdi), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] @@ -9367,7 +9367,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 768(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -9375,7 +9375,7 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 888(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -9383,7 +9383,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 @@ -9391,14 +9391,14 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1112(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -9406,7 +9406,7 @@ ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] @@ -9414,27 +9414,27 @@ ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 1440(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm13[3] -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1560(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps 1664(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1,2],xmm15[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] ; AVX2-SLOW-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] ; AVX2-SLOW-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,3] ; AVX2-SLOW-NEXT: vbroadcastss 1784(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] @@ -10134,8 +10134,8 @@ ; AVX2-FAST-NEXT: # ymm13 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,2],ymm3[1,3],ymm15[4,6],ymm3[5,7] @@ -10161,8 +10161,8 @@ ; AVX2-FAST-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload @@ -10176,8 +10176,8 @@ ; AVX2-FAST-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = 
ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -10217,8 +10217,8 @@ ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2],ymm7[1,3],ymm9[4,6],ymm7[5,7] @@ -10231,8 +10231,8 @@ ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload @@ -10370,7 +10370,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm6 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm6, %xmm6 @@ -10379,7 +10379,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 320(%rdi), %xmm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm1[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -10394,7 +10394,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 544(%rdi), %xmm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 @@ -10409,7 +10409,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 768(%rdi), %xmm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 
= xmm1[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -10423,7 +10423,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 @@ -10438,7 +10438,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -10452,7 +10452,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm3[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1440(%rdi), %xmm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm8[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm8, %xmm8 @@ -10465,7 +10465,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm8[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 1664(%rdi), %xmm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-NEXT: vextractf128 $1, %ymm15, %xmm15 @@ -11183,8 +11183,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm5[0],mem[1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm13[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,2],ymm4[1,3],ymm10[4,6],ymm4[5,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 432(%rdi), %ymm14 @@ -11196,8 +11196,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm4[0],mem[1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm6 = ymm3[0,2],mem[1,3],ymm3[4,6],mem[5,7] @@ -11210,8 +11210,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -11225,8 +11225,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm3[0],mem[1],ymm3[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps $216, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload @@ -11266,8 +11266,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,2],ymm11[1,3],ymm12[4,6],ymm11[5,7] @@ -11280,8 +11280,8 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,1,0,7,5,5,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload @@ -11412,13 +11412,13 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = 
ymm5[0],mem[1],ymm5[2,3,4],mem[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 216(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm8, %xmm8 @@ -11427,14 +11427,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 320(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm11[0],mem[1],ymm11[2,3,4],mem[5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 440(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm8[7] @@ -11442,7 +11442,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 544(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1,2],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -11450,7 +11450,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 664(%rdi), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm8[7] @@ -11458,7 +11458,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 768(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,3,2] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm2, %xmm2 @@ -11466,7 +11466,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 888(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] @@ -11474,7 +11474,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 992(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm4[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm3, %xmm3 @@ -11482,14 +11482,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3,4],mem[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1112(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1216(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1,2],xmm7[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm4, %xmm4 @@ -11497,7 +11497,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3,4],mem[5],ymm4[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1336(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm7[7] @@ -11505,27 +11505,27 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 1440(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm13[3] -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1560(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 1664(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm4[0,1,2],xmm15[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermilps $225, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,0,2,3,5,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = ymm10[0],mem[1],ymm10[2,3,4],mem[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,0,3,3,5,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 1784(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1,2,3,4,5,6],ymm12[7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -61,12 +61,12 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vmovq %xmm4, (%rsi) ; AVX1-ONLY-NEXT: vmovq %xmm5, (%rdx) @@ -95,12 +95,12 @@ ; AVX2-ONLY-NEXT: vbroadcastss 48(%rdi), %xmm3 ; AVX2-ONLY-NEXT: vbroadcastss 16(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = 
ymm1[1,1,1,1,5,5,5,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-ONLY-NEXT: vmovq %xmm4, (%rsi) ; AVX2-ONLY-NEXT: vmovq %xmm5, (%rdx) @@ -128,12 +128,12 @@ ; AVX512F-SLOW-NEXT: vmovaps (%rdi), %ymm4 ; AVX512F-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] ; AVX512F-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] ; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX512F-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512F-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX512F-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovq %xmm2, (%rsi) ; AVX512F-SLOW-NEXT: vmovq %xmm3, (%rdx) @@ -195,12 +195,12 @@ ; AVX512BW-SLOW-NEXT: vmovaps (%rdi), %ymm4 ; AVX512BW-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5] ; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5] ; AVX512BW-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] ; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm6 ; AVX512BW-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7] ; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX512BW-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7] ; AVX512BW-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512BW-SLOW-NEXT: vmovq %xmm2, (%rsi) ; AVX512BW-SLOW-NEXT: vmovq %xmm3, (%rdx) @@ -330,17 +330,17 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm9[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 @@ -348,11 +348,11 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 @@ -360,7 +360,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vmovaps %xmm4, (%rsi) @@ -393,10 +393,10 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1,2],xmm10[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm7[0,1],xmm10[2,3] @@ -605,18 +605,18 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdi), %xmm14 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm15 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm9[0],xmm11[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm13[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm12[1],xmm7[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7] @@ -626,11 +626,11 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm13[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1,2],xmm13[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm8[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm10[4,5,6,7] @@ -640,7 +640,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7] @@ -648,7 +648,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,0],ymm12[4,5],ymm8[6,4] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 @@ -660,7 +660,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm14 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3],ymm8[4,5,6,7] @@ -668,7 +668,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,0],ymm15[4,5],ymm14[6,4] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm4, %xmm4 @@ -680,7 +680,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7] @@ -728,12 +728,12 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm12[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm13[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7] @@ -741,13 +741,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1,2],xmm9[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm12[0,1,2,3],ymm7[4,5,6,7] @@ -755,7 +755,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3] ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm13 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm15[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm15[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] @@ -1146,7 +1146,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -1160,7 +1160,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %xmm15 @@ -1172,22 +1172,22 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm15[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -1196,14 +1196,14 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm3[2],xmm9[3],xmm3[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3] @@ -1212,13 +1212,13 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm15[2],xmm7[3],xmm15[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} xmm4 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm14[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] @@ -1229,7 +1229,7 @@ ; AVX1-ONLY-NEXT: vmovaps 288(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rdi), %ymm4 @@ -1247,7 +1247,7 @@ ; AVX1-ONLY-NEXT: vmovaps 480(%rdi), %ymm7 ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -1261,7 +1261,7 @@ ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm10 ; AVX1-ONLY-NEXT: vmovaps 352(%rdi), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm9[0],ymm10[0],ymm9[2],ymm10[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -1280,7 +1280,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm14 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %ymm13 @@ -1300,7 +1300,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm0[1,0],ymm2[1,0],ymm0[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7] @@ -1313,7 +1313,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,0],ymm14[1,0],ymm13[5,4],ymm14[5,4] -; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] @@ -1326,7 +1326,7 @@ ; AVX1-ONLY-NEXT: vmovaps %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm9[1],ymm10[1],ymm9[3],ymm10[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm3[4,5,6,7] @@ -1336,7 +1336,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm6[1],ymm8[3],ymm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm12 = ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[6],ymm13[6],ymm14[7],ymm13[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 @@ -1352,7 +1352,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1362,7 +1362,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0],ymm14[3,0],ymm13[7,4],ymm14[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -1461,19 +1461,19 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm5[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = 
ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps %xmm15, %xmm10 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3] @@ -1485,7 +1485,7 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm9[0],xmm2[1],xmm9[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 @@ -1494,12 +1494,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm2[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm3[0,1,2],xmm13[3] ; AVX2-ONLY-NEXT: vmovaps %xmm3, %xmm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3] @@ -1509,13 +1509,13 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps (%rsp), %xmm10 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm10[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm11[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7] @@ -1528,7 +1528,7 @@ ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm6 @@ -1540,7 +1540,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3] ; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX2-ONLY-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm7 @@ -2473,7 +2473,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2490,7 +2490,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 @@ -2519,7 +2519,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm11 @@ -2546,7 +2546,7 @@ ; AVX1-ONLY-NEXT: vmovaps 704(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm7 @@ -2560,19 +2560,19 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm3[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm8[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -2586,7 +2586,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload @@ -2598,7 +2598,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] @@ -2622,7 +2622,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3] @@ -2644,7 +2644,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3] @@ -2658,14 +2658,14 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm12[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
xmm3 = xmm5[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] @@ -2677,14 +2677,14 @@ ; AVX1-ONLY-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm11[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm11[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm9[0,1,2],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] @@ -2759,7 +2759,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2] ; AVX1-ONLY-NEXT: vmovaps %ymm4, %ymm14 ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2786,7 +2786,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2813,7 +2813,7 @@ ; AVX1-ONLY-NEXT: vmovaps 608(%rdi), %ymm5 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm5[0],ymm0[0],ymm5[2],ymm0[2] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm10[4,5,6,7] @@ -2834,7 +2834,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm2[0],ymm0[0],ymm2[2],ymm0[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2854,7 +2854,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm6[1,0],ymm7[5,4],ymm6[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = 
ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2871,7 +2871,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm7[1,0],ymm9[1,0],ymm7[5,4],ymm9[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2888,7 +2888,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,0],ymm2[1,0],ymm5[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2906,7 +2906,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2922,7 +2922,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm3[1],ymm12[1],ymm3[3],ymm12[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2936,7 +2936,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2952,7 +2952,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm2[1],ymm4[1],ymm2[3],ymm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2966,7 +2966,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm5[1],mem[1],ymm5[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm9[2],ymm15[2],ymm9[3],ymm15[3],ymm9[6],ymm15[6],ymm9[7],ymm15[7] @@ -2986,7 +2986,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = ymm11[3,0],mem[3,0],ymm11[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2999,7 +2999,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3017,7 +3017,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -3029,7 +3029,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm15[3,0],ymm9[3,0],ymm15[7,4],ymm9[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] @@ -3225,7 +3225,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1] @@ -3261,21 +3261,21 @@ ; 
AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm8[0],mem[0],xmm8[1],mem[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -3287,7 +3287,7 @@ ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm6[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -3301,13 +3301,13 @@ ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3] @@ -3320,13 +3320,13 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm11[2],mem[2],xmm11[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm12[0,1,2],xmm14[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],ymm14[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm10[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm10[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm0[0,1,2],xmm14[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3] @@ -3334,12 +3334,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm4[0,1,2],xmm8[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2,3,4,5],ymm8[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm9[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1],xmm14[2,3] @@ -3349,7 +3349,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm9 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm9 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm14[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm14[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 @@ -3359,7 +3359,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3] @@ -3397,14 +3397,14 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm14[2],xmm5[3],xmm14[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3] ; 
AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm8[2],xmm11[3],xmm8[3] @@ -5428,7 +5428,7 @@ ; AVX1-ONLY-NEXT: vmovaps 448(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -5445,7 +5445,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 800(%rdi), %xmm1 @@ -5474,7 +5474,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 1312(%rdi), %xmm1 @@ -5502,7 +5502,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 1824(%rdi), %xmm1 @@ -5530,7 +5530,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm6 @@ -5559,7 +5559,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 544(%rdi), %xmm4 @@ -5588,7 +5588,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 
; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovaps 1056(%rdi), %xmm12 @@ -5616,7 +5616,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vmovaps 1568(%rdi), %xmm8 @@ -5632,7 +5632,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm15, %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -5668,7 +5668,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm15[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm15[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm13[0],mem[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 @@ -5717,7 +5717,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[0,1],mem[2,3] @@ -5730,11 +5730,11 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -5860,7 +5860,7 @@ ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm0, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm1[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm2[0,1],xmm14[2,3] @@ -5882,7 +5882,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm7[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] @@ -5894,14 +5894,14 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm9[2],xmm5[2],xmm9[3],xmm5[3] ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm0[0,1],xmm9[2,3] @@ -6035,7 +6035,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm12[0],ymm2[2],ymm12[2] ; AVX1-ONLY-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6062,7 +6062,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6089,7 +6089,7 @@ ; AVX1-ONLY-NEXT: vmovaps 864(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6116,7 +6116,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1120(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6143,7 +6143,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1376(%rdi), %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6170,7 +6170,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1632(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] @@ -6197,7 +6197,7 @@ ; AVX1-ONLY-NEXT: vmovaps 1888(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7] @@ -6218,7 +6218,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6242,7 +6242,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6259,7 +6259,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte 
Reload ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm1[1,0],ymm5[1,0],ymm1[5,4],ymm5[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6278,7 +6278,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6297,7 +6297,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6316,7 +6316,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6334,7 +6334,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm2[4,5,6,7] @@ -6353,7 +6353,7 @@ ; AVX1-ONLY-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,0],mem[1,0],ymm0[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm1[4,5,6,7] @@ -6372,7 +6372,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded 
Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm14[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6389,7 +6389,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm10[1],ymm6[1],ymm10[3],ymm6[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6407,7 +6407,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm10[1],ymm3[3],ymm10[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6423,7 +6423,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm7[1],ymm11[1],ymm7[3],ymm11[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6440,7 +6440,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm5[1],ymm12[1],ymm5[3],ymm12[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6458,7 +6458,7 @@ ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6475,7 +6475,7 @@ ; AVX1-ONLY-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm13[1],ymm15[1],ymm13[3],ymm15[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6494,7 +6494,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm1[1],mem[1],ymm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm14, %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7] @@ -6509,7 +6509,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm3[1],mem[1],ymm3[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload @@ -6530,7 +6530,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] @@ -6547,7 +6547,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7] @@ -6564,7 +6564,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7] @@ -6582,7 +6582,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] @@ -6599,7 +6599,7 @@ ; 
AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -6617,7 +6617,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6633,7 +6633,7 @@ ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] @@ -6647,7 +6647,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0],ymm15[3,0],ymm9[7,4],ymm15[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] @@ -7020,7 +7020,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm12[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7058,7 +7058,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7077,7 +7077,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm15[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -7096,13 +7096,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] @@ -7176,7 +7176,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm12[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7197,14 +7197,14 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = mem[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] @@ -7218,7 +7218,7 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm13[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm13[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7239,7 +7239,7 @@ ; AVX2-ONLY-NEXT: vunpckhps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm15[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm15[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm0 = mem[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 @@ -7259,13 +7259,13 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm8[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] @@ -7279,14 +7279,14 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -7299,13 +7299,13 @@ ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm5[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1,2],xmm0[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm9[0,1,2],xmm15[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm15[0,1,2,3],ymm0[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -169,7 +169,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 @@ -298,11 +298,11 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 16(%rdi), %xmm1 @@ -543,19 +543,19 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm11[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm7 @@ -1070,39 +1070,39 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, 640(%rdi), %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps 
{{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm10[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = ymm4[0,1],mem[2,3],ymm4[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2,3],ymm2[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 @@ -2193,97 +2193,97 @@ ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 176(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 
32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 368(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 560(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 752(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-ONLY-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm11 = ymm11[0,1],mem[2,3],ymm11[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 944(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm10 = mem[0,1],ymm10[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1136(%rdi), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm9 = mem[0,1],ymm9[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1328(%rdi), %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm8 = mem[0,1],ymm8[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1520(%rdi), %ymm9 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm7 = mem[0,1],ymm7[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 80(%rdi), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm6 = mem[0,1],ymm6[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 272(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm14[0,1],ymm5[2,3],ymm14[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 464(%rdi), %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, (%rsp), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm4 = mem[0,1],ymm4[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 656(%rdi), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm3 = mem[0,1],ymm3[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 848(%rdi), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm15[0,1],ymm2[2,3],ymm15[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1040(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm1[2,3],ymm13[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1232(%rdi), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps $243, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1],ymm0[2,3],mem[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vbroadcastsd 1424(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 112(%rdi), %xmm0 diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -249,9 +249,9 @@ ; AVX1-ONLY: # %bb.0: ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] @@ -263,10 +263,10 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,1,2,1] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,0,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4],xmm3[5,6],xmm4[7] -; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,1,2,3] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = mem[0,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[1,1,1,1] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3,4],xmm3[5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,1,3,4,5,6,7] @@ -1690,8 +1690,8 @@ ; AVX1-ONLY-NEXT: vpslldq {{.*#+}} xmm2 = zero,zero,xmm4[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm4[0,1,0,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm12[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm12[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,0,0,0,65535,65535,65535,65535,0,0,0,65535,65535] @@ -1738,8 +1738,8 @@ ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm14[0,1,0,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm11[2],xmm4[2],xmm11[3],xmm4[3] @@ -3838,7 +3838,7 @@ ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm13[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] @@ -8101,7 +8101,7 @@ ; AVX1-ONLY-NEXT: vorps %ymm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: 
vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = mem[0,1,0,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm11[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm11[0,0,0,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3] ; AVX1-ONLY-NEXT: vpsrld $16, %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm0[2],xmm14[2],xmm0[3],xmm14[3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -1172,8 +1172,8 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3,4],ymm3[5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm3[0,0,0,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm3[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] @@ -1181,10 +1181,10 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[0,0,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[0,0,1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-2.ll @@ -57,8 +57,8 @@ ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-ONLY-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -39,7 +39,7 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,1,0,3,7,5,4,7] ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,u,1,u,5,u,u] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 @@ -287,12 +287,12 @@ ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -315,7 +315,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] @@ -344,12 +344,12 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm2[1,1,2,2] @@ -540,7 +540,7 @@ ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] @@ -552,17 +552,17 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] @@ -584,7 +584,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm4 ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[1,1,2,2] @@ -596,7 +596,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[2,3],ymm8[4],ymm1[5,6],ymm8[7] ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm6[1,1,2,2] @@ -641,7 +641,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm7[2],ymm2[3,4],ymm7[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm1[2,1,3,3] @@ -653,17 +653,17 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 32(%rdx), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] @@ -1038,7 +1038,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] @@ -1052,7 +1052,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm12 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] @@ -1066,7 +1066,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] @@ -1079,29 +1079,29 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 96(%rdx), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] +; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2] @@ -1139,7 +1139,7 @@ ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm2 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm14 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2],ymm5[3,4],ymm0[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[1,1,2,2] @@ -1164,12 +1164,12 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm15[0],ymm7[1],ymm15[2,3],ymm7[4],ymm15[5,6],ymm7[7] ; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm15[2],ymm7[3,4],ymm15[5],ymm7[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm13 = ymm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm13[0],ymm11[1,2],ymm13[3],ymm11[4,5],ymm13[6],ymm11[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm13 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm6[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm12[1,1,2,2] @@ -1185,7 +1185,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7] ; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm10[2],ymm6[3,4],ymm10[5],ymm6[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm3[1,1,2,2] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2],ymm14[3,4],ymm10[5],ymm14[6,7] ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 @@ -1243,7 +1243,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm9[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[2,1,3,3] @@ -1257,7 +1257,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4],ymm12[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm7[2,1,3,3] @@ -1271,7 +1271,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm3[2,1,3,3] @@ -1284,29 +1284,29 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 96(%rdx), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm15[2],ymm4[3,4],ymm15[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm15 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm15[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm12[2],ymm11[3,4],ymm12[5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm12[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2,3],ymm13[4],ymm11[5,6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2],ymm8[3,4],ymm13[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm12[0],ymm8[1,2],ymm12[3],ymm8[4,5],ymm12[6],ymm8[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd 
{{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm10[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0],ymm6[1,2],ymm9[3],ymm6[4,5],ymm9[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1,2],ymm5[3],ymm1[4,5],ymm5[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[1,1,2,2] @@ -2062,7 +2062,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] @@ -2076,7 +2076,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] @@ -2090,7 +2090,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] @@ -2106,7 +2106,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2124,7 +2124,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2141,7 +2141,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 184(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2158,7 +2158,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 216(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2175,7 +2175,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 248(%rdi), %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] @@ -2207,29 +2207,29 @@ ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] @@ -2302,7 +2302,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm11[2],ymm7[3,4],ymm11[5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm8[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2],ymm4[3,4],ymm7[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm10[1,1,2,2] @@ -2321,7 +2321,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 32(%rdx), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm3[1,1,2,2] @@ -2339,7 +2339,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 64(%rdx), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm9[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[1,1,2,2] @@ -2357,7 +2357,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm12[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm14[1,1,2,2] @@ -2376,7 +2376,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 128(%rdx), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 
= ymm0[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm2 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm2[1,1,2,2] @@ -2394,7 +2394,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 160(%rdx), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[1,1,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm4 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm4[1,1,2,2] @@ -2412,7 +2412,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 192(%rdx), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[1,1,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm5[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm5[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm9 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[1,1,2,2] @@ -2430,7 +2430,7 @@ ; AVX2-FAST-NEXT: vbroadcastsd 224(%rdx), %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2],ymm6[3,4],ymm11[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[1,1,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm9[0,0,3,3,4,4,7,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4],ymm11[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm11 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[1,1,2,2] @@ -2507,7 +2507,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,1,3,3] @@ -2521,7 +2521,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm7[2,1,3,3] @@ -2535,7 +2535,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 88(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm5[2,1,3,3] @@ -2551,7 +2551,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2569,7 +2569,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm10[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 152(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2586,7 +2586,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 184(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2603,7 +2603,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 216(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -2620,7 +2620,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 248(%rdi), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] @@ -2652,29 +2652,29 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm13[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4],ymm15[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm15[0],ymm13[1,2],ymm15[3],ymm13[4,5],ymm15[6],ymm13[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4],ymm10[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2],ymm8[3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -41,8 +41,8 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,0,2,5,7,4,6] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3,0,2,5,7,4,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r8) ; AVX1-ONLY-NEXT: vzeroupper @@ -57,7 +57,7 @@ ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: 
vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-SLOW-NEXT: vmovaps %ymm0, (%r8) ; AVX2-SLOW-NEXT: vzeroupper @@ -87,7 +87,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm0, (%r8) ; AVX2-FAST-PERLANE-NEXT: vzeroupper @@ -152,16 +152,16 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm2[0,1,1,0,4,5,5,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm2[0,1,1,0,4,5,5,4] ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} ymm6 = ymm5[0,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[1,0,2,3,5,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[1,0,2,3,5,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1],ymm4[2,3],ymm1[4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2,3],ymm1[4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,2,4,5,7,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2],ymm2[3],ymm5[4,5],ymm2[6],ymm5[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,2,3,7,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,2,3,7,6,6,7] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm3 = ymm4[1,0,3,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -274,7 +274,7 @@ ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm5[0],xmm7[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm9[2,3],ymm3[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm9 = xmm0[1],xmm1[1],zero,zero @@ -282,7 +282,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm6[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] @@ -290,7 +290,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,0],xmm2[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0,2,3] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] @@ -298,7 +298,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) @@ -482,7 +482,7 @@ ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm15[0],xmm2[1],xmm15[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm15[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -492,7 +492,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm11[0],xmm12[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm13, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -502,7 +502,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -515,7 +515,7 @@ ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps 16(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm10 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm10, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] @@ -524,7 +524,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] @@ -533,7 +533,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] 
; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm6[2],xmm8[3],xmm6[3] @@ -543,7 +543,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[3,0],xmm7[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -551,7 +551,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[3,0],xmm13[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 96(%r8) @@ -940,7 +940,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -957,7 +957,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -972,7 +972,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -987,7 +987,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm10[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm10[0],xmm8[1],xmm10[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} 
ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1002,7 +1002,7 @@ ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm7 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm7[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1017,7 +1017,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm6[0],xmm5[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1032,7 +1032,7 @@ ; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm11 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1046,7 +1046,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm11[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm14[2,3],ymm0[4,5],ymm14[6,7] @@ -1060,7 +1060,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1073,7 +1073,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm1[2],xmm15[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm1[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3],ymm14[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1084,7 +1084,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm1[2],xmm14[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm1[3,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] 
; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1],ymm0[2,3],ymm12[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm10[2],xmm8[3],xmm10[3] @@ -1094,7 +1094,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm1[2],xmm12[2],xmm1[3],xmm12[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[3,0],xmm1[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1],ymm8[2,3],ymm10[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm10 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] @@ -1104,7 +1104,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[3,0],xmm1[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm10[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm5[2],xmm6[2],xmm5[3],xmm6[3] @@ -1114,7 +1114,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm1[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] @@ -1124,7 +1124,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm1[3,0],xmm6[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm2[2],xmm11[2],xmm2[3],xmm11[3] @@ -1133,7 +1133,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm13[2],xmm4[3],xmm13[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm13[3,0],xmm4[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm1, 32(%r8) @@ -1894,7 +1894,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm11[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill @@ -1907,7 +1907,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm9[0],xmm10[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1919,7 +1919,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm7[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1936,7 +1936,7 @@ ; AVX1-ONLY-NEXT: vmovaps 48(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1953,7 +1953,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1970,7 +1970,7 @@ ; AVX1-ONLY-NEXT: vmovaps 80(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -1987,7 +1987,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2004,7 +2004,7 @@ ; AVX1-ONLY-NEXT: vmovaps 112(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; 
AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2021,7 +2021,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2038,7 +2038,7 @@ ; AVX1-ONLY-NEXT: vmovaps 144(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2055,7 +2055,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2071,7 +2071,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%rdx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm12[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm2[0],xmm12[1],xmm2[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2086,7 +2086,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm9[0],xmm8[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2101,7 +2101,7 @@ ; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm5 ; AVX1-ONLY-NEXT: vmovaps 208(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm5[0],xmm4[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2115,7 +2115,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm2 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -2128,7 +2128,7 @@ ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps 240(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm6[0],xmm1[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,2,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm13[2,3],ymm15[4,5],ymm13[6,7] @@ -2142,7 +2142,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2155,7 +2155,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2168,7 +2168,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2181,7 +2181,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2194,7 +2194,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2207,7 +2207,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2220,7 +2220,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2233,7 +2233,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2246,7 +2246,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2259,7 +2259,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2272,7 +2272,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm14 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[3,0],xmm15[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm14[0,1],ymm13[2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -2283,7 +2283,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[3,0],xmm14[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm13 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] @@ -2293,7 +2293,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm0[2],xmm13[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,0],xmm13[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] @@ -2303,7 +2303,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm0[2],xmm9[3],xmm0[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm0[3,0],xmm9[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] @@ -2312,7 +2312,7 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm11[2],xmm0[3],xmm11[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[3,0],xmm0[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] @@ -2320,7 +2320,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm7[2],xmm10[3],xmm7[3] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm7[3,0],xmm10[3,0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,0,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 992(%r8) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -49,13 +49,13 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,1,4,6,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,1,4,6,6,5] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm0[u,u,0,2,u,5,7,u] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX1-ONLY-NEXT: vmovlps %xmm1, 32(%r9) ; AVX1-ONLY-NEXT: vmovaps %ymm0, (%r9) @@ -245,7 +245,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4,5],ymm5[6],ymm4[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] ; AVX2-FAST-NEXT: vmovaps %ymm4, 32(%r9) ; AVX2-FAST-NEXT: vmovaps %ymm0, (%r9) @@ -409,7 +409,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm14 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm13[2,3],ymm5[4,5,6],ymm13[7] @@ -440,9 +440,9 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2],ymm10[3,4],ymm8[5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm2[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] @@ -465,29 +465,29 @@ ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm6 ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm8 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3],ymm10[4],ymm5[5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = 
xmm6[0,1],xmm7[2],xmm6[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] @@ -495,17 +495,17 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] @@ -531,7 +531,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm9 ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm10 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm10[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm10[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5],ymm5[6,7] @@ -545,10 +545,10 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; 
AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] @@ -556,17 +556,17 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] @@ -588,29 +588,29 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm10[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm11[0,1],ymm5[2,3],ymm11[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0],ymm5[1,2,3],ymm10[4],ymm5[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,1,3] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm8[2,3],ymm6[4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1],ymm6[2,3,4,5],ymm7[6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm4[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm2[1],ymm7[2,3,4],ymm2[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm1[1,2],ymm8[3,4],ymm1[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4],ymm8[5,6,7] @@ -618,17 +618,17 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4],ymm8[5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm8 = ymm3[2],ymm1[2],ymm3[3],ymm1[3],ymm3[6],ymm1[6],ymm3[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0],ymm9[1,2],ymm10[3,4],ymm9[5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2],ymm8[3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2],ymm8[3,4,5,6],ymm9[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4],ymm3[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6,7] @@ -874,7 +874,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm12[0],xmm11[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm9[2,3],ymm4[4,5,6],ymm9[7] @@ -885,7 +885,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm14 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm14[0],xmm13[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm15 ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm15, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm15 = xmm8[0],xmm7[0],zero,zero @@ -958,16 +958,16 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm2 = ymm6[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm12[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm15[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4],ymm2[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm14[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm13[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] @@ -1004,11 +1004,11 @@ ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] @@ -1018,37 +1018,37 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm15 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps 
{{.*#+}} xmm13 = xmm14[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] @@ -1056,15 +1056,15 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 56(%r8), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm0 @@ -1074,26 +1074,26 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 24(%r8), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps 
{{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm14 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -1131,7 +1131,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm13 ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm15 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2],xmm5[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] @@ -1140,7 +1140,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm7[2],xmm9[3],xmm7[3] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm8 = xmm15[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm15[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm13[2],xmm8[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3],ymm8[4,5],ymm5[6,7] @@ -1157,17 +1157,17 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm12 ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1,2,3],ymm6[4],ymm10[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = 
ymm10[0],ymm8[1],ymm10[2,3,4],ymm8[5],ymm10[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1,2],ymm14[3,4],ymm1[5,6],ymm14[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2],ymm10[3,4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm10[1,2,3,4],ymm14[5],ymm10[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm14[1,2],ymm4[3,4],ymm14[5,6],ymm4[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm14 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[3,3,3,3] @@ -1184,8 +1184,8 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vinsertf128 $1, 32(%r8), %ymm7, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm4[0],ymm0[1,2,3],ymm4[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3,4],ymm0[5,6],ymm4[7] ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm13 @@ -1195,26 +1195,26 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 24(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm11[1],ymm15[2,3,4],ymm11[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2],ymm15[3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0],ymm5[1,2,3,4],ymm15[5],ymm5[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm8[0,1],ymm2[2],ymm8[3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3],ymm1[4,5,6,7] -; 
AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4,5,6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload @@ -1250,11 +1250,11 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1],ymm8[2,3],ymm15[4,5],ymm8[6,7] @@ -1264,37 +1264,37 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm15 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm9[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2,3],ymm10[4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, (%r8), %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm10[1,2,3],ymm9[4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm10 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm14[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm14[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm15[2],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0,1],ymm10[2,3],ymm13[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm13[1],ymm10[2,3,4,5],ymm13[6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = 
xmm15[0],xmm14[0],xmm15[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1,2,3],ymm11[4],ymm12[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm4[1],ymm12[2,3,4],ymm4[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm3[1,2],ymm13[3,4],ymm3[5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3,4],ymm13[5,6,7] @@ -1302,15 +1302,15 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2,3,4],ymm13[5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm13 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2],ymm13[3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 56(%r8), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm14[2],ymm13[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3,4],ymm14[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm0 @@ -1320,26 +1320,26 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3,4],ymm14[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 24(%r8), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm14[2],ymm8[3,4,5,6],ymm14[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm1[1],ymm14[2,3,4],ymm1[5],ymm14[6,7] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm14 = ymm14[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1,2],ymm9[3,4],ymm15[5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2],ymm14[3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0],ymm9[1,2,3,4],ymm14[5],ymm9[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2],ymm6[3],ymm3[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -1920,7 +1920,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm12 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm12[0],xmm10[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] @@ -1933,7 +1933,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm4[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5,6],ymm2[7] @@ -1946,7 +1946,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm8 ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm8[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, 
%ymm6, %ymm6 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm13 = zero,xmm1[1],xmm2[1],zero @@ -1961,7 +1961,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm6 = xmm6[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm13 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm6, %ymm13 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 @@ -2124,38 +2124,38 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm5[1,2,3,4],ymm0[5],ymm5[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1],ymm0[2],ymm1[3,4,5,6],ymm0[7] -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm1 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm0 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm13 = ymm13[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm0[2],ymm13[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2,3],mem[4],ymm1[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm12 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2],ymm13[3,4,5,6],ymm12[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1,2,3],ymm1[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[0,1,2],mem[3],ymm1[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm9[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm9 = ymm11[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2],ymm9[3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm7[1,2,3],ymm6[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = 
ymm6[0,1,2],ymm8[3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm15[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm7 = ymm14[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1],ymm3[2],ymm7[3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3],ymm2[4,5],ymm3[6,7] @@ -2214,14 +2214,14 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12 @@ -2230,9 +2230,9 @@ ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] @@ -2240,19 +2240,19 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = 
xmm3[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] @@ -2260,19 +2260,19 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -2282,12 +2282,12 @@ ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm6 @@ -2297,23 +2297,23 @@ ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vmovaps 96(%r8), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -2321,8 +2321,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2333,20 +2333,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2357,20 +2357,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2381,20 +2381,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -2402,36 +2402,36 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 120(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] ; 
AVX2-SLOW-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -2496,7 +2496,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm1 ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm8 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm8[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm7[2],xmm3[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] @@ -2505,7 +2505,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[1,1,2,2] +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] @@ -2516,7 +2516,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm14, %ymm10 ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm0 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm3[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm0[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] @@ -2535,7 +2535,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm11 ; AVX2-FAST-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm11 = xmm11[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2],xmm11[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2,3],ymm11[4,5],ymm10[6,7] @@ -2555,7 +2555,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm7, %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3,4],ymm5[5],ymm4[6,7] @@ -2564,15 +2564,15 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm12 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm12[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm12[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm13[1,2],ymm5[3,4],ymm13[5,6],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm12[2],ymm13[2],ymm12[3],ymm13[3],ymm12[6],ymm13[6],ymm12[7],ymm13[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] @@ -2591,20 +2591,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm10 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm9 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 
32(%rdi), %ymm8 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm8[1,2],ymm2[3,4],ymm8[5,6],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm10[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4],ymm1[5,6],ymm2[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm2 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[3,3,3,3] @@ -2624,20 +2624,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm5[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2660,20 +2660,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm14 ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-FAST-NEXT: vpermpd 
{{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2],ymm11[3,4],ymm15[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2,3,4],ymm15[5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0],ymm11[1,2],ymm15[3,4],ymm11[5,6],ymm15[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -2681,7 +2681,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2],ymm15[3,4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 120(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm15[2],ymm11[3,4,5,6],ymm15[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm12[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4],ymm12[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = mem[0,1,3,0,4,5,7,4] @@ -2691,26 +2691,26 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1,2,3],ymm12[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm9[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm7 = ymm7[0,1,2],mem[3],ymm7[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm5[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm5[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2],ymm5[3,4,5,6],ymm4[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1,2,3],ymm3[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vpermilps 
{{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -2770,14 +2770,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12 @@ -2786,9 +2786,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] @@ -2796,19 +2796,19 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] @@ -2816,19 +2816,19 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -2838,12 +2838,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm6 @@ -2853,23 +2853,23 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm14 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4],ymm5[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -2877,8 +2877,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm6[2],ymm13[3],ymm6[3],ymm13[6],ymm6[6],ymm13[7],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2889,20 +2889,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2913,20 +2913,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[2,3,4],ymm8[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1,2],ymm1[3,4],ymm6[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm7[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -2937,20 +2937,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1,2],ymm15[3,4],ymm2[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 
112(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2,3,4],ymm15[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm4[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0],ymm0[1,2],ymm15[3,4],ymm0[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -2958,36 +2958,36 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 120(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1],ymm15[2],ymm0[3,4,5,6],ymm15[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2,3],mem[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm14[0,1,3,0,4,5,7,4] ; AVX2-FAST-PERLANE-NEXT: vpermilps $78, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2],ymm14[3,4,5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1,2,3],ymm0[4,5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0,1,2],mem[3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1],ymm10[2],ymm11[3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm8[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4,5,6],ymm6[7] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm5 = ymm5[0,1,2],mem[3],ymm5[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload @@ -4106,7 +4106,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm0[0],xmm2[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 4(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] @@ -4121,7 +4121,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm2[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 36(%rdx), %xmm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6],ymm4[7] @@ -4135,7 +4135,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 68(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm3 = zero,xmm13[1],xmm10[1],zero @@ -4150,7 +4150,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 100(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm11 @@ -4167,7 +4167,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 132(%rdx), %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm4 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm0 @@ -4185,7 +4185,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm4 = xmm1[0],xmm0[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 164(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 @@ -4203,7 +4203,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 196(%rdx), %xmm6 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm0 @@ -4221,7 +4221,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,0] ; AVX1-ONLY-NEXT: vbroadcastss 228(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 @@ -4292,10 +4292,9 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm1 -; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm14 -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm1[1,1],ymm14[5,5],ymm1[5,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm14[1,1],ymm12[1,1],ymm14[5,5],ymm12[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 @@ -4460,11 +4459,11 @@ ; AVX1-ONLY-NEXT: vinsertps {{.*#+}} xmm2 = zero,zero,xmm1[2],xmm0[2] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1],xmm0[1,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm2 -; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm1 +; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1],ymm2[1,1],ymm1[5,5],ymm2[5,5] +; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm2 +; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm1[1,1],ymm2[5,5],ymm1[5,5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4],ymm0[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm1 @@ -4472,10 +4471,11 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3],xmm1[3,3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm12 -; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %ymm2 +; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3],ymm12[3,3],ymm2[7,7],ymm12[7,7] +; AVX1-ONLY-NEXT: vmovaps 
160(%rsi), %ymm3 +; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,3],ymm2[3,3],ymm3[7,7],ymm2[7,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm2 @@ -4563,86 +4563,86 @@ ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm10[2],ymm6[3,4,5,6],ymm10[7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm10 = ymm15[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm14[0,1,3,0,4,5,7,4] -; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm10 = mem[1,0,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm10 = ymm12[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm6[2],ymm10[3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm6 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps 
{{.*#+}} ymm6 = ymm6[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm15 = ymm13[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm15[0,1],ymm6[2],ymm15[3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1,2,3],ymm3[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm3 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2,3],mem[4],ymm3[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1,2,3],ymm3[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = ymm3[0,1,2],mem[3],ymm3[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1],ymm15[2],ymm13[3,4,5,6],ymm15[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps $227, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm14 = mem[3,0,2,3,7,4,6,7] -; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm12[4],ymm14[5,6,7] -; AVX1-ONLY-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload -; AVX1-ONLY-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm12[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload +; AVX1-ONLY-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] +; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm12[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2],ymm12[3,4,5,6],ymm15[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0],ymm12[1,2,3],ymm14[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm12[0,1,2],mem[3],ymm12[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4],ymm7[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm11[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = mem[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1],ymm8[2],ymm11[3,4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1,2,3],ymm7[4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3],ymm7[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[0,1,3,0,4,5,7,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[0,1,3,0,4,5,7,4] ; AVX1-ONLY-NEXT: vpermilpd {{.*#+}} ymm4 = ymm5[1,0,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1],ymm1[2],ymm4[3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] @@ -4740,14 +4740,14 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm12 @@ -4756,9 +4756,9 @@ ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] @@ -4766,19 +4766,19 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] ; 
AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] @@ -4786,19 +4786,19 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -4808,12 +4808,12 @@ ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = 
xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovaps 96(%r8), %ymm6 @@ -4822,9 +4822,9 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -4834,23 +4834,23 @@ ; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 128(%r8), %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-SLOW-NEXT: vmovaps 128(%r8), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4860,21 +4860,21 @@ ; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 160(%r8), %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = 
ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4886,21 +4886,21 @@ ; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 192(%r8), %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4912,21 +4912,21 @@ ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %xmm3 ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, 224(%r8), %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm2[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -4937,7 +4937,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rdx), %ymm14 ; AVX2-SLOW-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] @@ -4945,7 +4945,7 @@ ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -4953,8 +4953,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -4965,7 +4965,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -4975,7 +4975,7 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -4983,8 +4983,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} 
ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -4995,7 +4995,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -5005,7 +5005,7 @@ ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -5013,8 +5013,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5025,7 +5025,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -5035,7 +5035,7 @@ ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -5043,8 +5043,8 @@ ; AVX2-SLOW-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5055,7 +5055,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -5065,15 +5065,15 @@ ; AVX2-SLOW-NEXT: vmovaps 128(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5084,20 +5084,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rdx), %ymm12 ; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %ymm11 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %ymm8 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5108,20 +5108,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 208(%r8), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5132,20 +5132,20 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 240(%r8), %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -5160,7 +5160,7 @@ ; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] 
; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -5204,7 +5204,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7] ; AVX2-SLOW-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -5215,26 +5215,26 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -5339,7 +5339,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm8 ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm14 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %xmm9 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm14[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -5348,7 +5348,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -5359,7 +5359,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %xmm6 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] @@ -5378,7 +5378,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm12 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm12 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm0[2],xmm12[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm12 = ymm12[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5],ymm15[6,7] @@ -5397,7 +5397,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] @@ -5416,7 +5416,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] @@ -5435,7 +5435,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm0[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] @@ -5454,7 +5454,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm0[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,1,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] @@ -5474,7 +5474,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm13 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] @@ -5483,7 +5483,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm15 ; AVX2-FAST-NEXT: vmovaps (%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm15[1,2],ymm11[3,4],ymm15[5,6],ymm11[7] @@ -5492,8 +5492,8 @@ ; AVX2-FAST-NEXT: vbroadcastsd 16(%r8), %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm10[1,2,3,4],ymm11[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm13[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm13[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4],ymm10[5,6],ymm11[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] @@ -5512,7 +5512,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovaps 32(%rcx), %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm8 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm0[1],ymm4[2,3,4],ymm0[5],ymm4[6,7] @@ -5522,7 +5522,7 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm9 ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,1,2,2,5,5,6,6] ; 
AVX2-FAST-NEXT: vmovaps %ymm0, %ymm10 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm9[1,2],ymm5[3,4],ymm9[5,6],ymm5[7] @@ -5531,8 +5531,8 @@ ; AVX2-FAST-NEXT: vbroadcastsd 48(%r8), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0],ymm4[1,2,3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm8[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2],ymm5[3,4],ymm4[5,6],ymm5[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm5 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[3,3,3,3] @@ -5552,7 +5552,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm0[1],ymm3[2,3,4],ymm0[5],ymm3[6,7] @@ -5562,7 +5562,7 @@ ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovaps %ymm0, %ymm7 ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm6[1,2],ymm4[3,4],ymm6[5,6],ymm4[7] @@ -5571,8 +5571,8 @@ ; AVX2-FAST-NEXT: vbroadcastsd 80(%r8), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1,2,3,4],ymm4[5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm5[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm5[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3,4],ymm3[5,6],ymm4[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[3,3,3,3] @@ -5595,7 +5595,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm3, %ymm4 ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -5605,7 +5605,7 @@ ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,1,2,2,5,5,6,6] ; 
AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -5613,8 +5613,8 @@ ; AVX2-FAST-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[6],ymm5[6],ymm6[7],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5637,7 +5637,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rdx), %ymm13 ; AVX2-FAST-NEXT: vmovaps 128(%rcx), %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm13[1],ymm0[2,3,4],ymm13[5],ymm0[6,7] @@ -5647,15 +5647,15 @@ ; AVX2-FAST-NEXT: vmovaps 128(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5678,22 +5678,22 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rdx), %ymm1 ; AVX2-FAST-NEXT: vmovaps 160(%rcx), %ymm12 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovaps %ymm1, %ymm3 ; AVX2-FAST-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 160(%rdi), %ymm10 ; AVX2-FAST-NEXT: vmovaps 160(%rsi), %ymm9 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm9[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps 
{{.*#+}} ymm1 = ymm1[0],ymm10[1,2],ymm1[3,4],ymm10[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm12[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm10[2],ymm9[3],ymm10[3],ymm9[6],ymm10[6],ymm9[7],ymm10[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5716,20 +5716,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-FAST-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 208(%r8), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -5752,20 +5752,20 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm1[1,2],ymm11[3,4],ymm1[5,6],ymm11[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2],ymm8[3,4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vbroadcastsd 240(%r8), %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = 
ymm11[0],ymm8[1,2,3,4],ymm11[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm11 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0],ymm8[1,2],ymm11[3,4],ymm8[5,6],ymm11[7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[3,3,3,3] @@ -5823,7 +5823,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm14[1,2,3],ymm8[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm14 = ymm14[0,1,2,3],mem[4],ymm14[5,6,7] ; AVX2-FAST-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload @@ -5834,27 +5834,27 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0],ymm13[1,2,3],ymm14[4,5],ymm13[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4],ymm9[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm12[0,1,3,0,4,5,7,4] ; AVX2-FAST-NEXT: vpermilps $78, (%rsp), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm12 = mem[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2],ymm12[3,4,5,6],ymm10[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1,2,3],ymm9[4,5],ymm10[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm9 = ymm9[0,1,2],mem[3],ymm9[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] 
+; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload @@ -5953,14 +5953,14 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm11[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1],ymm5[2,3],ymm12[4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm12 @@ -5969,9 +5969,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm8[2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm9[2,3],ymm7[4,5],ymm9[6,7] @@ -5979,19 +5979,19 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5],ymm8[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm7 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm7 = xmm7[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm8 = xmm12[0],xmm6[0],xmm12[1],xmm6[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm8 = xmm8[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm8[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1],ymm7[2,3],ymm9[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 32(%r8), %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3],ymm8[4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm12[2],xmm6[2],xmm12[3],xmm6[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,0,2,1] -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2,3],ymm3[4,5],ymm6[6,7] @@ -5999,19 +5999,19 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4,5],ymm4[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 64(%r8), %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3],ymm4[4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -6021,12 +6021,12 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %ymm6 @@ -6035,9 +6035,9 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = 
xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] @@ -6047,23 +6047,23 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 128(%r8), %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r8), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6073,21 +6073,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 160(%r8), %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6099,21 +6099,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 192(%r8), %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6125,21 +6125,21 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[0,1,3,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3],ymm6[4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, 224(%r8), %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0],ymm2[1,2,3],ymm5[4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] @@ -6150,7 +6150,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %ymm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rcx), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm14[1],ymm0[2,3,4],ymm14[5],ymm0[6,7] @@ -6158,7 +6158,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1,2],ymm1[3,4],ymm3[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -6166,8 +6166,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6178,7 +6178,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -6188,7 +6188,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = 
ymm1[2,2,2,3] @@ -6196,8 +6196,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 48(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6208,7 +6208,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -6218,7 +6218,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -6226,8 +6226,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 80(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6238,7 +6238,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -6248,7 +6248,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] @@ -6256,8 +6256,8 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 112(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm4[2],ymm5[3],ymm4[3],ymm5[6],ymm4[6],ymm5[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6268,7 +6268,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm2, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -6278,15 +6278,15 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %ymm13 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1,2],ymm1[3,4],ymm4[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 144(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm4[2],ymm13[3],ymm4[3],ymm13[6],ymm4[6],ymm13[7],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6297,20 +6297,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdx), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %ymm11 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] 
+; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3,4],ymm12[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rdi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1,2],ymm1[3,4],ymm9[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 176(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm11[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6321,20 +6321,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4],ymm7[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1,2],ymm1[3,4],ymm5[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 208(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,3,3,3] @@ -6345,20 +6345,20 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rcx), 
%ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm1[1,2],ymm15[3,4],ymm1[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2],ymm10[3,4],ymm15[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 240(%r8), %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2,3,4],ymm15[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, (%rsp) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[0,2,3,3,4,6,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm3[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0],ymm10[1,2],ymm15[3,4],ymm10[5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm15 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[3,3,3,3] @@ -6373,7 +6373,7 @@ ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2,3],mem[4],ymm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6],ymm15[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload @@ -6417,7 +6417,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm14[1,2,3],ymm10[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm10 = ymm10[0,1,2],mem[3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm13[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $16, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2,3],mem[4],ymm13[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps $52, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -6428,26 +6428,26 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2,3],ymm13[4,5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm13 = ymm13[0,1,2],mem[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm8[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1,2,3],ymm9[4],ymm8[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1],ymm9[2],ymm11[3,4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm9[1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = ymm8[0,1,2],mem[3],ymm8[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4,5,6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm4 = ymm4[0,1,2],mem[3],ymm4[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,2,3,7,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,3,0,4,5,7,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -56,7 +56,7 @@ ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[u,u,0,2,u,u,5,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,3],xmm4[1,3] @@ -78,9 +78,9 @@ ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} 
ymm0 = ymm0[0,2,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] @@ -127,9 +127,9 @@ ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; AVX2-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] @@ -227,24 +227,24 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[4],ymm8[4],ymm9[5],ymm8[5] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm11 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm6[0],ymm11[0],ymm6[2],ymm11[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm3[0],xmm1[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm13[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3],ymm10[4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm6 = ymm11[2],ymm6[2],ymm11[3],ymm6[3],ymm11[6],ymm6[6],ymm11[7],ymm6[7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm11 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm11[1,2],ymm7[5,6],ymm11[5,6] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm8[1],ymm9[1],ymm8[3],ymm9[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3],ymm2[3,3],ymm0[7,7],ymm2[7,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps %ymm0, 64(%rax) ; AVX1-ONLY-NEXT: vmovaps %ymm4, 32(%rax) @@ -284,7 +284,7 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-ONLY-NEXT: vbroadcastf128 
{{.*#+}} ymm1 = [2,6,3,7,2,6,3,7] ; AVX2-ONLY-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm1, %ymm1 @@ -413,7 +413,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -422,7 +422,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm9 ; AVX1-ONLY-NEXT: vmovaps (%rdx), %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,2],xmm9[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm6 @@ -436,30 +436,30 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm8 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm2[1,2],ymm3[1,2],ymm2[5,6],ymm3[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm12[3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm10[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm10[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3,4,5,6],ymm10[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm8[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss (%rdx), %xmm3 @@ -943,7 +943,7 @@ ; AVX1-ONLY-NEXT: 
vmovaps 32(%rdx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm6[1,2],xmm8[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -960,7 +960,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm13[0],ymm4[1],ymm13[1],ymm4[4],ymm13[4],ymm4[5],ymm13[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm9[0],ymm7[0],ymm9[2],ymm7[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5],ymm0[6,7] @@ -968,7 +968,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,2],xmm3[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 @@ -987,7 +987,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rdx), %ymm11 ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm12 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm12[0],ymm11[0],ymm12[2],ymm11[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1],ymm10[2,3],ymm14[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%r8), %ymm14 @@ -998,7 +998,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm13[2],ymm4[3],ymm13[3],ymm4[6],ymm13[6],ymm4[7],ymm13[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm7[1,2],ymm9[1,2],ymm7[5,6],ymm9[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm10[2,3],ymm4[4,5,6,7] @@ -1007,20 +1007,20 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[1,2],ymm12[1,2],ymm11[5,6],ymm12[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r9), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 @@ -1034,25 +1034,25 @@ ; AVX1-ONLY-NEXT: vbroadcastss 32(%r9), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm9[3,0],ymm7[3,0],ymm9[7,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm13[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4,5,6],ymm6[7] ; AVX1-ONLY-NEXT: vbroadcastss (%rcx), %xmm6 @@ -1068,14 +1068,14 @@ ; AVX1-ONLY-NEXT: vbroadcastss (%r9), %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm12[3,0],ymm11[3,0],ymm12[7,4],ymm11[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm7[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm4, 160(%rax) @@ -2350,7 +2350,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2369,7 +2369,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm10[0],ymm4[0],ymm10[2],ymm4[2] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] @@ -2377,7 +2377,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill @@ -2401,7 +2401,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm1 @@ -2415,7 +2415,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2440,7 +2440,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm14[0],ymm6[0],ymm14[2],ymm6[2] ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm1 @@ -2454,7 +2454,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm15[1,2] ; AVX1-ONLY-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 ; AVX1-ONLY-NEXT: vmovaps 96(%rdi), %xmm4 @@ -2475,7 +2475,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm11 ; AVX1-ONLY-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm12[0],ymm11[2],ymm12[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm11[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1 @@ -2489,7 +2489,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm0[1,2],ymm10[1,2],ymm0[5,6],ymm10[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm9[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm10[2,3],ymm9[4,5,6,7] @@ -2502,7 +2502,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,2],ymm9[1,2],ymm10[5,6],ymm9[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm1[4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] @@ -2513,7 +2513,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[1,2],ymm14[1,2],ymm6[5,6],ymm14[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3],ymm5[4,5,6,7] @@ -2521,7 +2521,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm15[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[0,0,0,0] ; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] @@ -2535,7 +2535,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,2],ymm13[1,2],ymm12[5,6],ymm13[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm2 ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] @@ -2558,26 +2558,26 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm11[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss 32(%rdx), %xmm2 @@ -2594,26 +2594,26 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm9[3,0],ymm10[3,0],ymm9[7,4],ymm10[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload ; 
AVX1-ONLY-NEXT: # ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2,3,4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm3 ; AVX1-ONLY-NEXT: vbroadcastss 64(%rdx), %xmm5 @@ -2630,50 +2630,50 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm8, %ymm5 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm8[1],ymm5[2,3,4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = ymm8[3,0],mem[3,0],ymm8[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm7[2,3],ymm8[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm8 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm8[1],ymm7[2,3,4,5,6],ymm8[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm9, %ymm8 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm10, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0],ymm9[1],ymm8[2,3,4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,0],ymm12[3,0],ymm13[7,4],ymm12[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm9 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1],ymm4[2,3,4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm4, 736(%rax) @@ -2929,7 +2929,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -2962,7 +2962,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload @@ -2995,7 +2995,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -3030,7 +3030,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -3684,7 +3684,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm13 # 
16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm10[2],mem[2],xmm10[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm10, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -3717,7 +3717,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm8, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload @@ -3750,7 +3750,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm9, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload @@ -3785,7 +3785,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload @@ -5146,7 +5146,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5164,7 +5164,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm5[0],ymm4[0],ymm5[2],ymm4[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5],ymm0[6,7] @@ -5172,7 +5172,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 
= ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,2],xmm2[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5198,7 +5198,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r8), %ymm1 @@ -5212,7 +5212,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5238,7 +5238,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r8), %ymm1 @@ -5252,7 +5252,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5277,7 +5277,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r8), %ymm1 @@ -5291,7 +5291,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5316,7 +5316,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r8), %ymm1 @@ -5330,7 +5330,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 160(%rdi), %xmm6 @@ -5354,7 +5354,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[2],ymm1[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%r8), %ymm1 @@ -5368,7 +5368,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,2],xmm0[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 192(%rdi), %xmm2 @@ -5390,7 +5390,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%r8), %ymm1 @@ -5404,7 +5404,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2],xmm1[1,2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5428,7 +5428,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm15 = ymm5[0],ymm1[0],ymm5[2],ymm1[2] ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1],ymm15[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r8), %ymm0 @@ -5444,7 +5444,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm15 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm15 = 
ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm15[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm12[4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 20(%r8), %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3],ymm15[4,5,6,7] @@ -5459,7 +5459,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 52(%r8), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] @@ -5474,7 +5474,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 84(%r8), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] @@ -5502,7 +5502,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm14[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm13[4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 116(%r8), %xmm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] @@ -5516,7 +5516,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm11 = ymm11[1,2],mem[1,2],ymm11[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 148(%r8), %xmm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm14[2,3],ymm11[4,5,6,7] @@ -5525,9 +5525,9 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm15[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,0,0,0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm14[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm14[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = 
ymm11[0,1],ymm7[2,3],ymm11[4,5,6,7] @@ -5544,7 +5544,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = ymm0[1,2],mem[1,2],ymm0[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 180(%r8), %xmm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2,3],ymm6[4,5,6,7] @@ -5555,7 +5555,7 @@ ; AVX1-ONLY-NEXT: vpermilps $0, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0,0,0] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[0,0,0,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm7[0,0,0,0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm6[0,1],ymm3[2,3],ymm6[4,5,6,7] @@ -5569,7 +5569,7 @@ ; AVX1-ONLY-NEXT: vshufps $153, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm3[1,2],mem[1,2],ymm3[5,6],mem[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 212(%r8), %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2,3],ymm2[4,5,6,7] @@ -5580,7 +5580,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,2],ymm5[1,2],ymm1[5,6],ymm5[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vbroadcastss 244(%r8), %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] @@ -5603,28 +5603,28 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = 
ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm12[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss 32(%rcx), %xmm0 @@ -5643,29 +5643,29 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = mem[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vbroadcastss 64(%rcx), %xmm1 @@ -5684,53 +5684,53 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps 
{{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm2[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4,5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm2 = mem[2,3],ymm2[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm4[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm4[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm4 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm13[2,3],ymm4[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3,4,5],ymm5[6,7] 
; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm5 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1],ymm4[2,3,4,5,6],ymm5[7] ; AVX1-ONLY-NEXT: vbroadcastss 128(%rcx), %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss 128(%rdx), %xmm11 @@ -5747,75 +5747,75 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm11 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1],ymm11[2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm12[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm12[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0],ymm12[1],ymm11[2,3,4,5,6],ymm12[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm10 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm10 = mem[2,3],ymm12[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1],ymm10[2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm12 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm12[1],ymm10[2,3,4,5,6],ymm12[7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3,4,5,6],ymm13[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; 
AVX1-ONLY-NEXT: # ymm13 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm9 = mem[2,3],ymm13[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0,1],ymm9[2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm13 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm13[1],ymm9[2,3,4,5,6],ymm13[7] ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm13 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm13 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm14, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm14 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm15, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2,3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm0[3,0],ymm3[3,0],ymm0[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm6[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1],ymm3[2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm14[1],ymm3[2,3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: vbroadcastss 224(%rcx), %xmm14 ; AVX1-ONLY-NEXT: vbroadcastss 224(%rdx), %xmm15 @@ -5832,28 +5832,28 @@ ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm14 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm14 = xmm7[2],mem[2],xmm7[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[2,3,2,3] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm7, %ymm14 ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = mem[2,1,3,3] ; AVX1-ONLY-NEXT: 
vinsertf128 $1, %xmm15, %ymm15, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%r9), %xmm15 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm15[0,2,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm15[0,2,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm14[0],ymm7[1],ymm14[2,3,4,5,6],ymm7[7] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = ymm6[3,0],mem[3,0],ymm6[7,4],mem[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm6 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm6 = mem[2,3],ymm14[2,3] ; AVX1-ONLY-NEXT: vperm2f128 $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm14 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm14[1],ymm6[2,3,4,5,6],ymm14[7] ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: vmovaps %ymm6, 1504(%rax) @@ -6379,7 +6379,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -6406,10 +6406,10 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] @@ -6419,7 +6419,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 
# 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6458,7 +6458,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6496,7 +6496,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6533,7 +6533,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6568,7 +6568,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -6603,7 +6603,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload @@ -6638,7 +6638,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm3, 
%ymm13 ; AVX2-SLOW-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload @@ -6663,10 +6663,10 @@ ; AVX2-SLOW-NEXT: vbroadcastss 240(%r9), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[2,3],ymm14[2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7] @@ -7974,7 +7974,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload @@ -8001,10 +8001,10 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm14[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[2,3],ymm0[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm15[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,2,2,3,4,6,6,7] @@ -8014,7 +8014,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8053,7 +8053,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8091,7 +8091,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8128,7 +8128,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8163,7 +8163,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -8198,7 +8198,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload @@ -8233,7 +8233,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm13 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm13 = xmm3[2],mem[2],xmm3[3],mem[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[2,3,2,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm3, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpermilps $250, (%rsp), %xmm14 # 16-byte Folded Reload @@ 
-8258,10 +8258,10 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%r9), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4],ymm3[5],ymm14[6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm14 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 $19, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[2,3],ymm14[2,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,1,3,3,6,5,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1],ymm14[2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm15 = mem[0,2,2,3,4,6,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll @@ -96,9 +96,9 @@ ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> @@ -166,9 +166,9 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2,2,1,4,6,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u> @@ -391,7 +391,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[4],ymm5[4],ymm7[5],ymm5[5] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[0,1,2,0] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,1,2,0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6],ymm3[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3],xmm2[3,3] @@ -421,15 +421,15 @@ ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3] ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-SLOW-NEXT: 
vpermilps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm9 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6] @@ -472,7 +472,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm6 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm8, %ymm8 @@ -489,7 +489,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm4 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm5 = ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm5, %ymm3 @@ -504,7 +504,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3] ; AVX2-FAST-NEXT: vmovaps %ymm2, (%rax) ; AVX2-FAST-NEXT: vmovaps %ymm4, 64(%rax) @@ -530,15 +530,15 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm8 = [2,6,0,3,2,6,0,3] ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm8, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm4[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm9 = ymm9[2],ymm3[2],ymm9[3],ymm3[3],ymm9[6],ymm3[6],ymm9[7],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm5 = zero,xmm5[1],xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm10 = [5,0,2,6,5,0,2,6] @@ -790,12 +790,12 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm12[2,0],xmm13[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm12, %ymm12 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1],ymm9[2,3],ymm12[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6],ymm9[7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3],ymm9[2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm9 = ymm7[1],ymm8[1],ymm7[3],ymm8[3] @@ -818,7 +818,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm9 = xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm9, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6],ymm9[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] @@ -857,7 +857,7 @@ ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm5[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] @@ -866,7 +866,7 @@ ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm14 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm15[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] @@ -876,7 +876,7 @@ ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm9 @@ -886,10 +886,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, 
%xmm9, %ymm0, %ymm9 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] @@ -898,7 +898,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm13, %xmm10 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] @@ -907,12 +907,12 @@ ; AVX2-SLOW-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm13 @@ -929,7 +929,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] @@ -962,7 +962,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0],xmm4[1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm9, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7] @@ -971,7 +971,7 @@ ; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm9 = zero,xmm12[1],xmm11[1],zero ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm13 ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm14 -; 
AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm13[2],xmm15[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0],ymm9[1,2],ymm15[3,4,5,6,7] @@ -981,20 +981,20 @@ ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm9 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm8[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm7[2],ymm9[3,4,5],ymm7[6],ymm9[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm6[2,3,4,5],ymm9[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,2,2,2] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm10[1,1],ymm3[1,1],ymm10[5,5],ymm3[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5,6],ymm6[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm8[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm7[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm8[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm7[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0],ymm15[1],ymm9[2,3,4],ymm15[5],ymm9[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm15 @@ -1008,7 +1008,7 @@ ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm0, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5,6],ymm9[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm9 = xmm5[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm9 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0,1,2],xmm9[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm15[4,5,6,7] @@ -1036,7 +1036,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5] ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] @@ -1069,7 +1069,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm5[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm5[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm4[1],xmm10[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 
%xmm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7] @@ -1078,7 +1078,7 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm10 = zero,xmm13[1],xmm12[1],zero ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm15 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm15[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm15[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm14[2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2],ymm11[3,4,5,6,7] @@ -1088,7 +1088,7 @@ ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[6],ymm2[6],ymm0[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm7[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm6[2],ymm11[3,4,5],ymm6[6],ymm11[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm9 @@ -1098,10 +1098,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm9 = xmm14[3,3],xmm15[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm11 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm11[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5,6],ymm11[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm11 = xmm5[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm11 = xmm5[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm11 = xmm4[0,1,2],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] @@ -1110,7 +1110,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm13, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm10 = xmm10[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm10 = xmm10[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1],ymm9[2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] @@ -1119,12 +1119,12 @@ ; AVX2-FAST-PERLANE-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm10[0],ymm12[0],ymm10[2],ymm12[2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm9[0,1,2,3],ymm10[4,5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm0[1],ymm10[2,3,4],ymm0[5],ymm10[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: 
vpermilps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm13 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm13[0],ymm10[1],ymm13[2,3,4],ymm10[5],ymm13[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm13 @@ -1141,7 +1141,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[4],ymm2[4],ymm0[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[3,3],xmm5[3,3] @@ -1581,7 +1581,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm4[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[1,1,2,2,5,5,6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6],ymm4[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3,4],ymm4[5],ymm3[6,7] @@ -1604,7 +1604,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -1666,7 +1666,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm9[0],xmm3[0],xmm9[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1689,7 +1689,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6],ymm0[7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] @@ -1722,7 +1722,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm2[3,3],ymm1[3,3],ymm2[7,7],ymm1[7,7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm4[2,3],ymm1[1,2],ymm4[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1735,7 +1735,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm5[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm2 @@ -1748,7 +1748,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -1819,7 +1819,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -1833,18 +1833,18 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %xmm8 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm12 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps (%rdx), %xmm5 @@ -1867,7 +1867,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm14 @@ -1887,7 +1887,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm2 ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] @@ -1908,11 +1908,11 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm10 @@ -1923,7 +1923,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm9, %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -1938,11 +1938,11 @@ ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm2 @@ -1953,7 +1953,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] @@ -1965,7 +1965,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] @@ -1979,7 +1979,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] @@ -1987,9 +1987,9 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm4 @@ -2008,7 +2008,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] @@ -2018,7 +2018,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] @@ -2070,7 +2070,7 @@ ; AVX2-FAST-NEXT: vmovaps (%r9), %xmm8 ; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FAST-NEXT: vmovaps %xmm4, %xmm14 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 @@ -2086,18 +2086,18 @@ ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm10 ; AVX2-FAST-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm4 = xmm3[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm1[1,2],ymm4[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm7[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm10[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm10[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm2 @@ -2122,7 +2122,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 16(%rax), %xmm4 @@ -2167,7 +2167,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4],ymm1[5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovaps %xmm14, %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm14[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm14 @@ -2188,12 +2188,12 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,1],ymm11[1,1],ymm2[5,5],ymm11[5,5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm13[1],ymm2[2,3,4],ymm13[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm4[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; 
AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 48(%rax), %ymm3 @@ -2210,7 +2210,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5,6],ymm3[7] ; AVX2-FAST-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0,1,2],xmm3[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm4 @@ -2231,7 +2231,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3,4],ymm6[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] @@ -2240,9 +2240,9 @@ ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm3[1,1],ymm1[1,1],ymm3[5,5],ymm1[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm4[5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm7[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm2[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm5 @@ -2262,7 +2262,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] @@ -2271,7 +2271,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1,2],xmm15[3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm7 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm8[6],ymm7[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5],ymm7[6,7] @@ -2325,7 +2325,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} 
xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -2339,18 +2339,18 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm7 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm7[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm7[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2],ymm4[3,4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm2[2],xmm5[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm5[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdx), %xmm5 @@ -2373,7 +2373,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm0[2],ymm6[3,4,5],ymm0[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm14 @@ -2393,7 +2393,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm10 = ymm2[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm10 = ymm10[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm1[1,2,3,4,5,6],ymm10[7] ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[2,3],ymm0[2,3] @@ -2414,11 +2414,11 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,2,2,2] +; 
AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm10 @@ -2429,7 +2429,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -2444,11 +2444,11 @@ ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm9[2],xmm5[3],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm2 @@ -2459,7 +2459,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm5, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm12[0],xmm10[1],xmm12[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] @@ -2471,7 +2471,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,1],ymm6[1,1],ymm4[5,5],ymm6[5,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] @@ -2485,7 +2485,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm12[1,1,1,1,5,5,5,5] ; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[2,3,4],ymm5[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm2[2,2,2,2] @@ -2493,9 +2493,9 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm2[1,1],ymm13[1,1],ymm2[5,5],ymm13[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm3[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3,4],ymm3[5],ymm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm4 @@ -2514,7 +2514,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5],ymm5[6,7] @@ -2524,7 +2524,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] @@ -3443,7 +3443,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm3[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -3467,7 +3467,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; 
AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -3524,7 +3524,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3587,7 +3587,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3647,7 +3647,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm14[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm14[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm14[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3673,7 +3673,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[6],ymm3[6],ymm4[7],ymm3[7] @@ -3689,7 +3689,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm5[2],ymm8[3],ymm5[3],ymm8[6],ymm5[6],ymm8[7],ymm5[7] @@ -3704,7 +3704,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[6],ymm9[6],ymm10[7],ymm9[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX1-ONLY-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -3783,7 +3783,7 
@@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3796,7 +3796,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vbroadcastsd 40(%rax), %ymm13 @@ -3815,7 +3815,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm2[2,3],ymm11[1,2],ymm2[6,7],ymm11[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm11[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm1[1,2,3,4],ymm11[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -3828,7 +3828,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5,6],ymm8[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm12[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm12[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1,2],xmm9[3] ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rax), %ymm10 @@ -3850,7 +3850,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3],ymm4[1,2],ymm2[6,7],ymm4[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm9[1,2,3,4],ymm4[5,6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vshufps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload @@ -3971,7 +3971,7 @@ ; AVX2-SLOW-NEXT: vmovaps (%r9), %xmm8 ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm9 ; AVX2-SLOW-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -3986,18 +3986,18 @@ ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm12 ; AVX2-SLOW-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: 
vpermilps {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm6 @@ -4009,7 +4009,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 64(%rax), %xmm1 @@ -4020,7 +4020,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm13 @@ -4032,7 +4032,7 @@ ; AVX2-SLOW-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm11 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rax), %xmm1 @@ -4043,7 +4043,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm3 @@ -4070,7 +4070,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm2 @@ 
-4093,7 +4093,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 48(%rax), %xmm2 @@ -4116,7 +4116,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 80(%rax), %xmm2 @@ -4125,7 +4125,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 96(%rdx), %ymm9 @@ -4144,7 +4144,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm7, %xmm15 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4159,10 +4159,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -4175,7 +4175,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -4189,10 +4189,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3] ; 
AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -4204,7 +4204,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -4218,10 +4218,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -4235,7 +4235,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -4248,23 +4248,23 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] ; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm0 -; 
AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 108(%r8), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,2,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -4294,7 +4294,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -4303,9 +4303,9 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -4313,7 +4313,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -4351,7 +4351,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm3 ; AVX2-SLOW-NEXT: vmovaps %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] @@ -4372,7 +4372,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-SLOW-NEXT: 
vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm3 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7] @@ -4397,7 +4397,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm11 ; AVX2-SLOW-NEXT: vmovaps %ymm15, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -4493,7 +4493,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm5 ; AVX2-FAST-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -4511,18 +4511,18 @@ ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm13 ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm8 ; AVX2-FAST-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm13[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2],xmm6[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0],ymm1[1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm5[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm5[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 @@ -4535,7 +4535,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 64(%rax), %xmm1 @@ -4546,7 +4546,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm2 @@ -4560,7 +4560,7 @@ ; AVX2-FAST-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r9), %xmm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1] ; AVX2-FAST-NEXT: vmovaps %xmm2, %xmm12 ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] @@ -4573,7 +4573,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm2 @@ -4598,7 +4598,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm6 ; AVX2-FAST-NEXT: vmovaps (%r9), %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2],ymm1[3,4,5],ymm6[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4623,7 +4623,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm15 @@ -4646,7 +4646,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm15 @@ -4655,7 +4655,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm3[1],ymm15[2,3,4],ymm3[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 96(%rdx), %ymm4 @@ -4670,13 +4670,13 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm15[2],ymm14[3,4,5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm14 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6],ymm15[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-FAST-NEXT: vbroadcastss 108(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm12[2,2,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2,3],ymm14[4,5,6,7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm15 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[2,2,2,2] @@ -4728,20 +4728,20 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm11[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm11[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3,4],ymm10[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm8[1,1],ymm9[5,5],ymm8[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm3 = ymm6[0,1,0,1,4,5,4,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm5[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm6[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -4771,7 +4771,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6],ymm2[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1,2],xmm2[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] @@ -4819,14 +4819,14 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm7, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm5 = xmm1[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm5 = xmm1[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm6 ; AVX2-FAST-NEXT: 
vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm5[2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm14[1],ymm5[2,3,4],ymm14[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -4866,14 +4866,14 @@ ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm7, %ymm0 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm5[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm6 = xmm1[0,1,2],xmm6[3] ; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm6[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm6 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6],ymm9[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[4],ymm1[4],ymm10[5],ymm1[5] @@ -4898,7 +4898,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm10 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm10 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6],ymm10[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm10 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5] @@ -4923,7 +4923,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm11 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5],ymm11[6,7] @@ -5014,7 +5014,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm8[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm8[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -5029,18 +5029,18 @@ ; 
AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm9[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm6[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm12[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm6 @@ -5052,7 +5052,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rax), %xmm1 @@ -5063,7 +5063,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm13 @@ -5075,7 +5075,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm11 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm11[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %xmm1 @@ -5086,7 +5086,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 
= ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm3 @@ -5113,7 +5113,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm2 @@ -5136,7 +5136,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 48(%rax), %xmm2 @@ -5159,7 +5159,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rax), %xmm2 @@ -5168,7 +5168,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdi), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rdx), %ymm9 @@ -5187,7 +5187,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1],ymm0[2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -5202,10 +5202,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm5[3,3],xmm4[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm8[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm8[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -5218,7 +5218,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm10[0],xmm14[0],xmm10[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -5232,10 +5232,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm10[3,3],xmm14[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm6[2],xmm8[2],xmm6[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -5247,7 +5247,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm4 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -5261,10 +5261,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm9[3,3],xmm8[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] @@ -5278,7 +5278,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: 
vunpcklps {{.*#+}} xmm4 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -5291,23 +5291,23 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm8[3,3],xmm7[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6],ymm4[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm4 = xmm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm5 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 108(%r8), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1],ymm0[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm5[2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -5337,7 +5337,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2,3,4],ymm5[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -5346,9 +5346,9 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,1],ymm4[1,1],ymm9[5,5],ymm4[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm13[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = 
ymm13[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm11[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm3 @@ -5356,7 +5356,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2],ymm1[3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm14[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm12[1],ymm2[2,3,4],ymm12[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,2,2,2] @@ -5394,7 +5394,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2],ymm3[3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm7 = ymm4[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm7 = ymm5[0],ymm8[0],ymm5[1],ymm8[1],ymm5[4],ymm8[4],ymm5[5],ymm8[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5],ymm3[6,7] @@ -5415,7 +5415,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0],ymm3[1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm10[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm9 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[4],ymm14[4],ymm12[5],ymm14[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5],ymm3[6,7] @@ -5440,7 +5440,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm11[0],ymm3[1,2,3,4],ymm11[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %ymm15, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm12 = ymm15[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6],ymm12[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7230,7 +7230,7 @@ ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = mem[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = 
ymm4[2,3],ymm0[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -7253,7 +7253,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] ; AVX1-ONLY-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -7310,7 +7310,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, (%rsp) # 16-byte Spill @@ -7374,7 +7374,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7437,7 +7437,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7500,7 +7500,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7564,7 +7564,7 @@ ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7625,7 +7625,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; 
AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7670,7 +7670,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -7687,7 +7687,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] @@ -7703,7 +7703,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7] @@ -7720,7 +7720,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm15[1],mem[1],ymm15[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm12[2],mem[2],ymm12[3],mem[3],ymm12[6],mem[6],ymm12[7],mem[7] @@ -7737,7 +7737,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7] @@ -7754,7 +7754,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0, %ymm0 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[6],ymm2[6],ymm6[7],ymm2[7] ; AVX1-ONLY-NEXT: vmovaps %ymm2, %ymm10 @@ -7771,7 +7771,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm4[1],ymm5[3],ymm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm14[2],ymm7[3],ymm14[3],ymm7[6],ymm14[6],ymm7[7],ymm14[7] @@ -7827,7 +7827,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm11[0] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[2,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, 224(%r8), %ymm1, %ymm1 @@ -7907,7 +7907,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7942,7 +7942,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -7976,7 +7976,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8010,7 +8010,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = 
ymm2[2,3],ymm1[1,2],ymm2[6,7],ymm1[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8045,7 +8045,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8058,7 +8058,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 168(%rax), %ymm3 @@ -8079,7 +8079,7 @@ ; AVX1-ONLY-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[2,3],ymm2[1,2],ymm3[6,7],ymm2[5,6] ; AVX1-ONLY-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm2[2,3,2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4],ymm2[5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8092,7 +8092,7 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6],ymm2[7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rax), %ymm3 @@ -8311,7 +8311,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-SLOW-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -8326,7 +8326,7 @@ ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-SLOW-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-SLOW-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps 
{{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] @@ -8335,11 +8335,11 @@ ; AVX2-SLOW-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 32(%rdx), %xmm7 @@ -8351,7 +8351,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 64(%rax), %xmm1 @@ -8362,7 +8362,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 64(%rcx), %xmm2 @@ -8376,7 +8376,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 96(%rax), %xmm1 @@ -8387,7 +8387,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 96(%rcx), %xmm3 @@ -8402,7 +8402,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 128(%rax), %xmm1 @@ -8413,7 +8413,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 128(%rcx), %xmm3 @@ -8428,7 +8428,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 160(%rax), %xmm1 @@ -8439,7 +8439,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 160(%rcx), %xmm3 @@ -8454,7 +8454,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%r8), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-SLOW-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-SLOW-NEXT: vmovaps 192(%rax), %xmm1 @@ -8465,7 +8465,7 @@ ; AVX2-SLOW-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-SLOW-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-SLOW-NEXT: vmovaps 192(%rcx), %xmm3 @@ -8492,7 +8492,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps (%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 16(%rax), %xmm2 @@ -8515,7 +8515,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 48(%rax), %xmm2 @@ -8538,7 +8538,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 80(%rax), %xmm2 @@ -8561,7 +8561,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 96(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 112(%rax), %xmm2 @@ -8584,7 +8584,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 144(%rax), %xmm2 @@ -8607,7 +8607,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 160(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 176(%rax), %xmm2 @@ -8630,7 +8630,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%r9), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vmovaps 208(%rax), %xmm2 @@ -8645,7 +8645,7 @@ ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastsd 224(%r8), %ymm15 @@ -8657,14 +8657,14 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm15, %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] ; AVX2-SLOW-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm4[1,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm4[1,1,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2],xmm13[3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm1[1],xmm0[1],zero ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = 
ymm13[0],ymm14[1,2],ymm13[3,4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 228(%r8), %ymm12 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm13 @@ -8673,20 +8673,20 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovaps 224(%r8), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 224(%rsi), %ymm2 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2] ; AVX2-SLOW-NEXT: vmovaps 224(%rdx), %ymm10 @@ -8704,7 +8704,7 @@ ; AVX2-SLOW-NEXT: vbroadcastss %xmm8, %xmm13 ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -8718,10 +8718,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 8(%rax), %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] @@ -8734,7 +8734,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 
= xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -8748,10 +8748,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3],xmm14[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm15[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 40(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8764,7 +8764,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8778,10 +8778,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 72(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8795,7 +8795,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -8809,10 +8809,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] 
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 104(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8826,7 +8826,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -8840,10 +8840,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 136(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8857,7 +8857,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -8871,10 +8871,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 168(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8888,7 +8888,7 @@ ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-SLOW-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -8902,10 +8902,10 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-SLOW-NEXT: vunpckhps 
{{.*#+}} xmm6 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] ; AVX2-SLOW-NEXT: vbroadcastsd 200(%rax), %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -8928,13 +8928,13 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 240(%rdx), %ymm5 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vbroadcastss 236(%r8), %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] ; AVX2-SLOW-NEXT: vunpckhps {{.*#+}} ymm5 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -8964,7 +8964,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -8973,9 +8973,9 @@ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vbroadcastsd 16(%rax), %ymm2 @@ -8983,7 +8983,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -9002,7 +9002,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -9079,7 +9079,7 @@ ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -9094,7 +9094,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -9116,7 +9116,7 @@ ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[4],ymm15[4],ymm10[5],ymm15[5] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -9141,7 +9141,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 80(%rdx), %ymm0 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] @@ -9167,7 +9167,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 112(%rdx), %ymm2 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4] +; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -9194,7 +9194,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 144(%rdx), %ymm1 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -9221,7 +9221,7 @@ ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7] ; AVX2-SLOW-NEXT: vbroadcastss 176(%rdx), %ymm7 ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -9379,7 +9379,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-FAST-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -9398,7 +9398,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rsi), %xmm7 ; AVX2-FAST-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2],xmm2[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] @@ -9407,11 +9407,11 @@ ; AVX2-FAST-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm6[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 @@ -9424,7 +9424,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; 
AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 64(%rax), %xmm1 @@ -9435,7 +9435,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 64(%rcx), %xmm3 @@ -9450,7 +9450,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 96(%rax), %xmm1 @@ -9461,7 +9461,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 96(%rcx), %xmm3 @@ -9476,7 +9476,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 128(%rax), %xmm1 @@ -9487,7 +9487,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 128(%rcx), %xmm3 @@ -9502,7 +9502,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 160(%rax), %xmm1 @@ -9513,7 +9513,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd 
{{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 160(%rcx), %xmm3 @@ -9528,7 +9528,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%r8), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-NEXT: vmovaps 192(%rax), %xmm1 @@ -9539,7 +9539,7 @@ ; AVX2-FAST-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-FAST-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 192(%rcx), %xmm3 @@ -9565,7 +9565,7 @@ ; AVX2-FAST-NEXT: vmovaps (%r8), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps (%r9), %ymm12 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] @@ -9589,7 +9589,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 48(%rax), %xmm2 @@ -9612,7 +9612,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 80(%rax), %xmm2 @@ -9635,7 +9635,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 96(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 112(%rax), %xmm2 @@ -9658,7 +9658,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} 
ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 144(%rax), %xmm2 @@ -9681,7 +9681,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 160(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 176(%rax), %xmm2 @@ -9705,7 +9705,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-NEXT: vmovaps 208(%rax), %xmm2 @@ -9714,7 +9714,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %xmm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[1,1,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[1,1,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-NEXT: vmovaps 224(%rcx), %xmm3 @@ -9724,7 +9724,7 @@ ; AVX2-FAST-NEXT: vbroadcastss 228(%r8), %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovaps 224(%r9), %xmm7 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm15 = xmm7[1,1,1,1] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm15[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovaps 224(%rax), %xmm15 @@ -9753,14 +9753,14 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovaps 224(%r8), %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm6[2,3],ymm0[4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 224(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3,4],ymm2[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[2,2,2,2] ; AVX2-FAST-NEXT: vmovaps 224(%rdx), %ymm3 @@ -9787,13 +9787,13 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0],ymm4[1,2,3,4,5,6],ymm15[7] ; AVX2-FAST-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 240(%rdx), %ymm4 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm15 = ymm0[3,1,2,0,7,5,6,4] ; 
AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5],ymm4[6],ymm15[7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm15[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vbroadcastss 236(%r8), %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm15[1],ymm4[2,3,4,5,6,7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm7 = xmm7[2,2,3,3] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm7 = xmm7[2,2,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm7[2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[6],ymm1[6],ymm2[7],ymm1[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[2,2,2,2] @@ -9844,7 +9844,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 8(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9858,7 +9858,7 @@ ; AVX2-FAST-NEXT: vshufps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = ymm14[1,1],mem[1,1],ymm14[5,5],mem[5,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm12[0,0,0,0,4,4,4,4] ; AVX2-FAST-NEXT: vpermilps $68, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1,4,5,4,5] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] @@ -9890,7 +9890,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 40(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -9938,14 +9938,14 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 72(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -9986,7 +9986,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, 
%ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 104(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10034,7 +10034,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 136(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10082,7 +10082,7 @@ ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 168(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10130,7 +10130,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,2,2,2] +; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,2,2,2] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3] ; AVX2-FAST-NEXT: vbroadcastsd 200(%rax), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -10140,7 +10140,7 @@ ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm13[1,1],ymm0[5,5],ymm13[5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm7[1,1,1,1,5,5,5,5] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3,4],ymm6[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] @@ -10157,7 +10157,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 16(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10186,7 +10186,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 48(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -10215,7 +10215,7 @@ ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vbroadcastss 80(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm1 = ymm5[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovaps %ymm15, %ymm3 @@ -10242,7 +10242,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 112(%rdx), %ymm0 ; AVX2-FAST-NEXT: vmovaps %ymm11, %ymm3 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm11[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6],ymm5[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm5 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[4],ymm2[4],ymm12[5],ymm2[5] @@ -10268,7 +10268,7 @@ ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 144(%rdx), %ymm5 ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm12 = ymm2[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5],ymm5[6],ymm12[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm12 = ymm9[0],ymm1[0],ymm9[1],ymm1[1],ymm9[4],ymm1[4],ymm9[5],ymm1[5] @@ -10293,7 +10293,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0],ymm0[1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vbroadcastss 176(%rdx), %ymm5 -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm14[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6],ymm9[7] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10319,7 +10319,7 @@ ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm8 = ymm8[2,1,2,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} ymm8 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[4],ymm7[4],ymm6[5],ymm7[5] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4] +; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm13[3,1,2,0,7,5,6,4] ; AVX2-FAST-NEXT: vbroadcastss 208(%rdx), %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3,4,5],ymm7[6],ymm9[7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] @@ -10452,7 +10452,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] @@ -10467,7 +10467,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rsi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2],xmm2[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1,2],ymm2[3,4,5,6,7] @@ -10476,11 +10476,11 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rax), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm11[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm11[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdx), %xmm7 @@ -10492,7 +10492,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rax), %xmm1 @@ -10503,7 +10503,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rcx), %xmm2 @@ -10517,7 +10517,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rax), %xmm1 @@ -10528,7 +10528,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = 
xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%rcx), %xmm3 @@ -10543,7 +10543,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rax), %xmm1 @@ -10554,7 +10554,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%rcx), %xmm3 @@ -10569,7 +10569,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rax), %xmm1 @@ -10580,7 +10580,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%rcx), %xmm3 @@ -10595,7 +10595,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%r8), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rax), %xmm1 @@ -10606,7 +10606,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rcx), %xmm3 @@ -10633,7 +10633,7 @@ ; 
AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 16(%rax), %xmm2 @@ -10656,7 +10656,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 48(%rax), %xmm2 @@ -10679,7 +10679,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 80(%rax), %xmm2 @@ -10702,7 +10702,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 96(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 112(%rax), %xmm2 @@ -10725,7 +10725,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 128(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 144(%rax), %xmm2 @@ -10748,7 +10748,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 160(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 176(%rax), %xmm2 
@@ -10771,7 +10771,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%r9), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,1,2,2,5,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vmovaps 208(%rax), %xmm2 @@ -10786,7 +10786,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %xmm2 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm15 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm15 = xmm15[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm15 = xmm15[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm15 = ymm15[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1],ymm3[2,3],ymm15[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 224(%r8), %ymm15 @@ -10798,14 +10798,14 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm15, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm13[6],ymm14[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm4[1,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0,1],xmm2[2],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vinsertps {{.*#+}} xmm14 = zero,xmm1[1],xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0],ymm14[1,2],ymm13[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 228(%r8), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2],ymm12[3],ymm13[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm15, %ymm14, %ymm13 @@ -10814,20 +10814,20 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3],xmm4[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%r8), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 232(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rsi), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm12 = ymm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovaps 224(%rdx), %ymm10 @@ -10845,7 +10845,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastss %xmm8, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm13 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm13 = xmm13[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm13 = xmm13[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3],ymm13[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -10859,10 +10859,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3],xmm5[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 8(%rax), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] @@ -10875,7 +10875,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -10889,10 +10889,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm13[3,3],xmm14[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm7[2],xmm12[2],xmm7[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm15[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm15[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 40(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -10905,7 +10905,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 
16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -10919,10 +10919,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm11[2],xmm13[2],xmm11[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 72(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -10936,7 +10936,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm12[0],xmm9[0],xmm12[1],xmm9[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -10950,10 +10950,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm12[3,3],xmm9[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 104(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -10967,7 +10967,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 
16-byte Reload @@ -10981,10 +10981,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 136(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -10998,7 +10998,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -11012,10 +11012,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm11[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm11[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 168(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -11029,7 +11029,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -11043,10 +11043,10 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm5 = xmm9[3,3],xmm8[3,3] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} xmm6 = xmm13[2],xmm11[2],xmm13[3],xmm11[3] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[0,1,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6],ymm6[7] 
-; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm6 = xmm0[2,2,2,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm6 = xmm0[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm6 = xmm12[0,1,2],xmm6[3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 200(%rax), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] @@ -11069,13 +11069,13 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1,2,3,4,5,6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 240(%rdx), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6],ymm6[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[4],ymm2[4],ymm4[5],ymm2[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 236(%r8), %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3],ymm5[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vunpckhps {{.*#+}} ymm5 = ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[6],ymm2[6],ymm4[7],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,2] @@ -11105,7 +11105,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[2,3,4],ymm4[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -11114,9 +11114,9 @@ ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,1],ymm3[1,1],ymm8[5,5],ymm3[5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm14[0,0,0,0,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm9[0,1,0,1,4,5,4,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3,4],ymm1[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd 16(%rax), %ymm2 @@ -11124,7 +11124,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm15[1,1,1,1,5,5,5,5] ; 
AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm10[1],ymm0[2,3,4],ymm10[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -11143,7 +11143,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm6[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3,4],mem[5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] @@ -11220,7 +11220,7 @@ ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1],ymm12[1,1],ymm0[5,5],ymm12[5,5] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,1,1,1,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3,4],ymm11[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6],ymm1[7] @@ -11235,7 +11235,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 16(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm3[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -11257,7 +11257,7 @@ ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 48(%rdx), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm7[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm15[0],ymm10[1],ymm15[1],ymm10[4],ymm15[4],ymm10[5],ymm15[5] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] @@ -11282,7 +11282,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 80(%rdx), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vunpcklps {{.*#+}} 
ymm1 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[4],ymm6[4],ymm2[5],ymm6[5] @@ -11308,7 +11308,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 112(%rdx), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -11335,7 +11335,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm0[1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 144(%rdx), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm1[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -11362,7 +11362,7 @@ ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm7[0],ymm0[1,2,3,4],ymm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 176(%rdx), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm8 = ymm0[3,1,2,0,7,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6],ymm8[7] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -494,7 +494,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm6[0],ymm4[1],ymm6[1],ymm4[4],ymm6[4],ymm4[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm5[1,0],ymm3[1,0],ymm5[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -505,7 +505,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[3,0],ymm3[3,0],ymm5[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm11[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm1[4,5,6,7] @@ -516,7 +516,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm11 = 
ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[6],ymm5[6],ymm3[7],ymm5[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm11 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm6[1],ymm4[1],ymm6[3],ymm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm12, %xmm12 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm2[4,5,6,7] @@ -527,7 +527,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[4],ymm5[4],ymm3[5],ymm5[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm6[0],ymm4[0],ymm6[2],ymm4[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm7[4,5,6,7] @@ -542,18 +542,18 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm13 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm14[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm14[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm10[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm15[6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm15 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] @@ -564,14 +564,14 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm10[0],xmm9[0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm12[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm12[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 @@ -1078,7 +1078,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} 
ymm10 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm4[1],ymm3[1],ymm4[3],ymm3[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] @@ -1089,7 +1089,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm10 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] @@ -1100,7 +1100,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm4[0],ymm3[0],ymm4[2],ymm3[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm2[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %ymm2 @@ -1116,7 +1116,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm3 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7] @@ -1127,7 +1127,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[6],ymm8[6],ymm7[7],ymm8[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm2[1],ymm9[1],ymm2[3],ymm9[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1138,7 +1138,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[6],ymm2[6],ymm9[7],ymm2[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm8[3,0],ymm7[3,0],ymm8[7,4],ymm7[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1149,7 +1149,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[4],ymm8[4],ymm7[5],ymm8[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; 
AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm9[0],ymm2[2],ymm9[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1160,7 +1160,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm1 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[4],ymm2[4],ymm9[5],ymm2[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm7[1,0],ymm8[5,4],ymm7[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -1174,12 +1174,12 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovaps (%r9), %xmm15 ; AVX1-ONLY-NEXT: vmovaps (%r8), %xmm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm13[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm13[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm15[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rdi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm11[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm11[1,1,1,1] ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm0 @@ -1193,14 +1193,14 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm4 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm7 ; AVX1-ONLY-NEXT: vmovaps 32(%r9), %xmm6 @@ -1210,7 +1210,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 ; AVX1-ONLY-NEXT: vmovaps 32(%rdx), %xmm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm0[0,1,2],xmm12[3] ; AVX1-ONLY-NEXT: vmovaps 32(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm2 @@ -1221,7 +1221,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm12 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, 
%xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] @@ -1229,16 +1229,16 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm9 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm9 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm4[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm5[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm5[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 @@ -1248,14 +1248,14 @@ ; AVX1-ONLY-NEXT: # xmm3 = xmm11[2],mem[2],xmm11[3],mem[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm13[2],xmm15[2],xmm13[3],xmm15[3] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7] ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] @@ -1263,7 +1263,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm6[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7] @@ -1412,7 +1412,7 @@ ; AVX2-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vmovaps (%rsi), %xmm2 ; AVX2-ONLY-NEXT: vmovaps (%rdi), %xmm4 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[1,1,1,1] ; AVX2-ONLY-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vmovaps %xmm2, %xmm5 @@ -1424,7 +1424,7 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vmovaps (%r9), 
%xmm6 ; AVX2-ONLY-NEXT: vmovaps (%r8), %xmm14 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm6[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -1447,7 +1447,7 @@ ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 32(%rax), %xmm8 ; AVX2-ONLY-NEXT: vmovaps 32(%r10), %xmm5 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm8[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm8[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovaps 32(%r9), %xmm10 @@ -1457,7 +1457,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rcx), %xmm2 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %xmm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm1[0,1,2],xmm13[3] ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %xmm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdi), %xmm3 @@ -1469,7 +1469,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm7 = xmm5[2],xmm8[2],xmm5[3],xmm8[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm6[4,5,6,7] @@ -1487,11 +1487,11 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 @@ -1502,13 +1502,13 @@ ; AVX2-ONLY-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] ; AVX2-ONLY-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm2 # 16-byte Folded Reload ; AVX2-ONLY-NEXT: # xmm2 = xmm14[2],mem[2],xmm14[3],mem[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm11[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm11[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm12[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm5[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm5[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm15[0,1,2],xmm4[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm4 = 
xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -1516,7 +1516,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm12[2],xmm11[2],xmm12[3],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] @@ -2334,7 +2334,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm9[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] @@ -2345,7 +2345,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm9 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] @@ -2356,7 +2356,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm7[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm7 @@ -2372,7 +2372,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] @@ -2383,7 +2383,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm7[0],ymm2[0],ymm7[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2394,7 +2394,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = 
ymm2[0],ymm7[0],ymm2[1],ymm7[1],ymm2[4],ymm7[4],ymm2[5],ymm7[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2405,7 +2405,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2422,7 +2422,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] @@ -2435,7 +2435,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -2446,7 +2446,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -2457,7 +2457,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -2470,7 +2470,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = 
ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4 @@ -2487,7 +2487,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -2498,7 +2498,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -2509,7 +2509,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -2520,7 +2520,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -2532,7 +2532,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm10 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 @@ -2544,11 +2544,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -2556,12 +2556,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm7[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2570,7 +2570,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm10[2],xmm1[3],xmm10[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2582,7 +2582,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm8 ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm7 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm10 @@ -2594,11 +2594,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm4[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] @@ -2606,12 +2606,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm8[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; 
AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm10[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm2[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -2620,7 +2620,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -2632,7 +2632,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm10 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm10[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm10[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm15 @@ -2643,23 +2643,23 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0,1,2,3,4,5],ymm10[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7] @@ -2667,7 +2667,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vblendps 
{{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -2678,7 +2678,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm7 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm14 @@ -2689,23 +2689,23 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] @@ -2713,7 +2713,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3009,13 +3009,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; 
AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -3023,12 +3023,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -3037,7 +3037,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3064,13 +3064,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -3078,12 +3078,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = 
xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -3092,7 +3092,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -3118,25 +3118,25 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] @@ -3144,7 +3144,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -3169,25 +3169,25 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] @@ -3195,7 +3195,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] @@ -4904,7 +4904,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[4],ymm4[4],ymm3[5],ymm4[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm13 = ymm6[0],ymm5[0],ymm6[2],ymm5[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] @@ -4915,7 +4915,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm10 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[4],ymm6[4],ymm5[5],ymm6[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm13[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] @@ -4926,7 +4926,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm9 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm9, %xmm9 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm6[1],ymm5[1],ymm6[3],ymm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm13, %xmm13 ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm9[0,1],xmm13[2,3] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %ymm9 @@ -4942,7 +4942,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm5 = ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[6],ymm6[6],ymm5[7],ymm6[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7] @@ -4953,7 +4953,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm5 = ymm9[0],ymm2[0],ymm9[2],ymm2[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -4964,7 +4964,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm9[0],ymm2[1],ymm9[1],ymm2[4],ymm9[4],ymm2[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -4975,7 +4975,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm4, %xmm4 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] @@ -4992,7 +4992,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm8, %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm7[4,5,6,7] @@ -5005,7 +5005,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -5016,7 +5016,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: 
vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm4[1,0],ymm3[1,0],ymm4[5,4],ymm3[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -5027,7 +5027,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] @@ -5040,7 +5040,7 @@ ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[3,0],ymm3[3,0],ymm4[7,4],ymm3[7,4] ; AVX1-ONLY-NEXT: vmovaps 96(%r9), %ymm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm5, %xmm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %ymm4 @@ -5057,7 +5057,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5068,7 +5068,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,0],ymm2[1,0],ymm3[5,4],ymm2[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5079,7 +5079,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5090,7 +5090,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm1 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm3[3,0],ymm2[3,0],ymm3[7,4],ymm2[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7] ; 
AVX1-ONLY-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5109,7 +5109,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5120,7 +5120,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5131,7 +5131,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5142,7 +5142,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5161,7 +5161,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5172,7 +5172,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5183,7 +5183,7 @@ ; AVX1-ONLY-NEXT: 
vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5194,7 +5194,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5213,7 +5213,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5224,7 +5224,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5235,7 +5235,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5246,7 +5246,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5265,7 +5265,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rdx), %ymm8 ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %ymm9 ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[2],ymm8[2] -; AVX1-ONLY-NEXT: 
vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5276,7 +5276,7 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} ymm7 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1],xmm7[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5287,7 +5287,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm7, %xmm7 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm9[1],ymm8[1],ymm9[3],ymm8[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm10, %xmm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -5298,7 +5298,7 @@ ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} ymm3 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7] ; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm3, %xmm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7] ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5310,7 +5310,7 @@ ; AVX1-ONLY-NEXT: vmovaps (%rax), %xmm0 ; AVX1-ONLY-NEXT: vmovaps (%r10), %xmm1 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps (%rcx), %xmm6 @@ -5322,11 +5322,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm9[1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm2[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -5334,12 +5334,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} 
xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm4[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] @@ -5348,7 +5348,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm4[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] @@ -5360,7 +5360,7 @@ ; AVX1-ONLY-NEXT: vmovaps 32(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 32(%r10), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 32(%rcx), %xmm6 @@ -5372,11 +5372,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -5384,12 +5384,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -5398,7 +5398,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; 
AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5410,7 +5410,7 @@ ; AVX1-ONLY-NEXT: vmovaps 64(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 64(%r10), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rcx), %xmm6 @@ -5422,11 +5422,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -5434,12 +5434,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -5448,7 +5448,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5460,7 +5460,7 @@ ; AVX1-ONLY-NEXT: vmovaps 96(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 96(%r10), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm6 @@ -5472,11 +5472,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm10[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm10[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm0[1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7] @@ -5484,12 +5484,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm10[2],xmm9[2],xmm10[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -5498,7 +5498,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5510,7 +5510,7 @@ ; AVX1-ONLY-NEXT: vmovaps 128(%rax), %xmm3 ; AVX1-ONLY-NEXT: vmovaps 128(%r10), %xmm4 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rcx), %xmm6 @@ -5522,11 +5522,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm11 = xmm11[0],xmm8[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm2[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm2[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm1[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm1[1,1,1,1] ; AVX1-ONLY-NEXT: 
vblendps {{.*#+}} xmm10 = xmm10[0],xmm0[1],xmm10[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7] @@ -5534,12 +5534,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm6[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] @@ -5548,7 +5548,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -5560,7 +5560,7 @@ ; AVX1-ONLY-NEXT: vmovaps 160(%rax), %xmm9 ; AVX1-ONLY-NEXT: vmovaps 160(%r10), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm5 = xmm8[0],xmm9[0],xmm8[1],xmm9[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5],ymm6[6,7] ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm3 @@ -5572,11 +5572,11 @@ ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm15 = xmm15[0],xmm6[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm15 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm5[6,7] @@ -5584,12 +5584,12 @@ ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm7[2],xmm4[3],xmm7[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm9[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = 
ymm5[0,1,2,3,4,5],ymm4[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm3[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm3[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7] @@ -5598,7 +5598,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -5610,7 +5610,7 @@ ; AVX1-ONLY-NEXT: vmovaps 192(%rax), %xmm7 ; AVX1-ONLY-NEXT: vmovaps 192(%r10), %xmm8 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm9[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm9[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm15 @@ -5621,23 +5621,23 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm14 = xmm14[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm1[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm14[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm4[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm4[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm5[1],xmm14[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm14[0,1,2,3,4,5],ymm9[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm9[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1,2],xmm1[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm4 = xmm15[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm3[0,1,2],xmm4[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1],xmm4[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm1[4,5,6,7] @@ -5645,7 +5645,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] ; 
AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3],ymm2[4,5,6,7] @@ -5656,7 +5656,7 @@ ; AVX1-ONLY-NEXT: vmovaps 224(%rax), %xmm4 ; AVX1-ONLY-NEXT: vmovaps 224(%r10), %xmm7 ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm8[0,1,0,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm8[0,1,0,1] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm14[6,7] ; AVX1-ONLY-NEXT: vmovaps 224(%rcx), %xmm14 @@ -5667,23 +5667,23 @@ ; AVX1-ONLY-NEXT: vunpcklps {{.*#+}} xmm13 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX1-ONLY-NEXT: vmovlhps {{.*#+}} xmm13 = xmm13[0],xmm2[0] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm0[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm1[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[1,1,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[1,1,1,1] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm13 = xmm13[0],xmm5[1],xmm13[2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3,4,5],ymm8[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm8[4,5,6,7] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm4[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[2,2,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,2,2,2] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] @@ -5691,7 +5691,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm5[1] ; AVX1-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6255,13 +6255,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm4[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm4[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm3[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps 
{{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6269,12 +6269,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm3[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6283,7 +6283,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm3[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm4[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm4[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6310,13 +6310,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6324,12 +6324,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: 
vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6338,7 +6338,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6365,13 +6365,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm8[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm8[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm9[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6379,12 +6379,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6393,7 +6393,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6420,13 +6420,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps 
{{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6434,12 +6434,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6448,7 +6448,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6475,13 +6475,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6489,12 +6489,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 
; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6503,7 +6503,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6530,13 +6530,13 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm8[1],xmm10[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7] @@ -6544,12 +6544,12 @@ ; AVX2-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm5 = xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm0[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm0[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] @@ -6558,7 +6558,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm4[1],xmm0[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm5[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm5[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] @@ -6584,25 +6584,25 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} 
ymm10 = ymm9[0,1,2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm5[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm5[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0],xmm4[1],xmm8[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm15 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm7[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm7[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm0[1],xmm14[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm6[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm6[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm1[0,1,2],xmm7[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3],ymm5[4,5,6,7] @@ -6610,7 +6610,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm4[1],xmm1[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] @@ -6635,25 +6635,25 @@ ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm5[0,1,2,3],ymm12[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm6[1],xmm5[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm12 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm0[1,1,1,1] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm0[1,1,1,1] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm11[0],xmm15[1],xmm11[2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm6 = xmm14[2],xmm6[2],xmm14[3],xmm6[3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm2[2,2,2,2] +; AVX2-ONLY-NEXT: 
vshufps {{.*#+}} xmm11 = xmm2[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm11 = xmm1[0,1,2],xmm11[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm12 = xmm4[2,2,2,2] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm12 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm3[0,1,2],xmm12[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm12 = xmm6[0,1],xmm12[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] @@ -6661,7 +6661,7 @@ ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm6[1],xmm3[1] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX2-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll @@ -139,7 +139,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] @@ -270,7 +270,7 @@ ; AVX2-ONLY-NEXT: vmovaps 32(%rsi), %ymm3 ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm4 ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm5 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = mem[0,0] @@ -278,7 +278,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5],ymm7[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm3[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm3[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm1[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm5[2,3],ymm8[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm1[1],ymm3[1],ymm1[3],ymm3[3] @@ -547,11 +547,11 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm12[4,5],ymm7[6,7] ; AVX2-ONLY-NEXT: vmovaps 32(%rdx), %ymm12 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vmovaps 96(%rdx), %ymm13 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm8[0,1],ymm11[2,3],ymm8[4,5,6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm6[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1],ymm12[2,3],ymm9[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3] @@ -563,7 +563,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3],ymm6[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm5[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm5[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm3[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3],ymm11[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3] @@ -575,7 +575,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5],ymm3[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm0[4,5],ymm12[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps (%rdx), %ymm1 @@ -1087,7 +1087,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm2[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm3[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1102,7 +1102,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm6[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill @@ -1117,7 +1117,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm12[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1131,7 +1131,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = 
ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm13[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm15[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm13[1],ymm14[1],ymm13[3],ymm14[3] @@ -1143,7 +1143,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdx), %ymm1 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm10[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm8[1],ymm10[1],ymm8[3],ymm10[3] ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm10 @@ -1158,7 +1158,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 160(%rdx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm15[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 @@ -1173,7 +1173,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 192(%rdx), %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm4 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm4[1],ymm1[3],ymm4[3] ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm4 @@ -1188,7 +1188,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 224(%rdx), %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm7 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm9 = ymm7[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm9 = ymm7[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5],ymm9[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm4 = ymm4[1],ymm7[1],ymm4[3],ymm7[3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm7 @@ -2224,7 +2224,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd (%rdx), %ymm15 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm14 = ymm11[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm14 = ymm11[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm8[4,5],ymm14[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm12[2,3],ymm14[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2239,7 +2239,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 32(%rdx), %ymm11 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm11[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm9[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm9[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm5[4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm13[2,3],ymm8[4,5,6,7] ; 
AVX2-ONLY-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2254,7 +2254,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 64(%rdx), %ymm8 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm3[4,5],ymm5[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm10[2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2269,7 +2269,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 96(%rdx), %ymm5 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm4[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm2[4,5],ymm3[6,7] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3],ymm3[4,5,6,7] ; AVX2-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2284,7 +2284,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 128(%rdx), %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 128(%rdx), %ymm1 @@ -2302,7 +2302,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 160(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 160(%rdx), %ymm1 @@ -2320,7 +2320,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 192(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 192(%rdx), %ymm1 @@ -2338,7 +2338,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 224(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 224(%rdx), %ymm1 @@ -2356,7 +2356,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 256(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 256(%rdx), %ymm1 @@ -2374,7 +2374,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 288(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 288(%rdx), %ymm1 @@ -2392,7 +2392,7 @@ ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovaps 320(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 320(%rdx), %ymm1 @@ -2409,7 +2409,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 352(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 352(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 352(%rdx), %ymm1 @@ -2424,7 +2424,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 384(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 384(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 384(%rdx), %ymm1 @@ -2439,7 +2439,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 416(%rdx), %ymm2 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps 416(%rsi), %ymm1 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm15 = ymm1[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm15 = ymm1[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3],ymm0[4,5],ymm15[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; AVX2-ONLY-NEXT: vmovaps 416(%rdx), %ymm1 @@ -2454,7 +2454,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 448(%rdx), %ymm14 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3],ymm14[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 448(%rsi), %ymm14 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm13 = ymm14[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm13 = ymm14[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm1[4,5],ymm13[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm1[1],ymm14[1],ymm1[3],ymm14[3] ; 
AVX2-ONLY-NEXT: vmovaps 448(%rdx), %ymm14 @@ -2469,7 +2469,7 @@ ; AVX2-ONLY-NEXT: vbroadcastsd 480(%rdx), %ymm12 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm12[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vmovaps 480(%rsi), %ymm12 -; AVX2-ONLY-NEXT: vpermilps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] +; AVX2-ONLY-NEXT: vshufps {{.*#+}} ymm11 = ymm12[2,3,0,1,6,7,4,5] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm14[4,5],ymm11[6,7] ; AVX2-ONLY-NEXT: vunpckhpd {{.*#+}} ymm12 = ymm14[1],ymm12[1],ymm14[3],ymm12[3] ; AVX2-ONLY-NEXT: vmovaps 480(%rdx), %ymm14 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll @@ -803,7 +803,7 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm8[0],mem[0],ymm8[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 96(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 @@ -1737,7 +1737,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] @@ -1749,13 +1749,13 @@ ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm11[0],mem[0],ymm11[2],mem[2] ; AVX1-ONLY-NEXT: vmovapd 160(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 @@ -3719,7 +3719,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rcx), %xmm2 ; AVX1-ONLY-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = mem[2,3,2,3] @@ -3733,14 +3733,14 @@ ; AVX1-ONLY-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rcx), %xmm1 ; 
AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],mem[0],ymm4[2],mem[2] ; AVX1-ONLY-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%rcx), %ymm0 @@ -3763,7 +3763,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rdi), %ymm1 @@ -3772,7 +3772,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%rcx), %ymm0 @@ -3788,7 +3788,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%rcx), %ymm0 @@ -3810,7 +3810,7 @@ ; AVX1-ONLY-NEXT: vmovapd %ymm1, %ymm15 ; AVX1-ONLY-NEXT: vmovapd 480(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3] ; AVX1-ONLY-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps (%rdi), %ymm9 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -570,12 +570,12 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 8(%rcx), %ymm10 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm10[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm10[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = 
ymm13[0,1],ymm8[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm13 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm13[6,7] ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm13[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm13[2,3,2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm11 = ymm4[1],ymm11[1],ymm4[3],ymm11[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm14[0,1],ymm11[2,3,4,5,6,7] @@ -2030,7 +2030,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm6 ; AVX1-ONLY-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm6 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm6 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] @@ -2041,7 +2041,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm9[1],ymm2[1],ymm9[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] @@ -2079,7 +2079,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm9[1],ymm0[1],ymm9[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm8 @@ -2096,7 +2096,7 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm15 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm14 = xmm15[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm14 = xmm15[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -2113,7 +2113,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm13 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm13 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm13 @@ -4756,7 +4756,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm7[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm11 ; AVX1-ONLY-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm11 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm11 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2,3,4,5,6,7] ; 
AVX1-ONLY-NEXT: vmovaps 32(%rdi), %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9 @@ -4778,7 +4778,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm6[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm6[2,3,2,3] ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm4[1],ymm5[1],ymm4[3],ymm5[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] @@ -4824,7 +4824,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 48(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4841,7 +4841,7 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 72(%rcx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm11 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm11[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] @@ -4868,7 +4868,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 80(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4916,7 +4916,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 112(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4933,7 +4933,7 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 136(%rcx), %ymm3 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm14 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm14[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm14[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7] @@ -4960,7 +4960,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 144(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm4[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5008,7 +5008,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 176(%r8), %xmm0 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5025,7 +5025,7 @@ ; AVX1-ONLY-NEXT: vbroadcastsd 200(%rcx), %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm7 = xmm8[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm7 = xmm8[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7] @@ -10400,7 +10400,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps (%rsi), %xmm5 ; AVX1-ONLY-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm5 = xmm5[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7] @@ -10412,7 +10412,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 16(%rcx), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vmovaps 16(%r8), %xmm2 ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm7[1],ymm2[1],ymm7[3],ymm2[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],mem[6,7] @@ -10456,7 +10456,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 48(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 64(%r9), %xmm1 @@ -10476,7 +10476,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 64(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10496,7 +10496,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 80(%rcx), %xmm1 ; AVX1-ONLY-NEXT: 
vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 96(%rsi), %xmm0 @@ -10538,7 +10538,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 112(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 128(%r9), %xmm1 @@ -10558,7 +10558,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 128(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10578,7 +10578,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 144(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 160(%rsi), %xmm0 @@ -10620,7 +10620,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 176(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 192(%r9), %xmm1 @@ -10640,7 +10640,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 192(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10660,7 +10660,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 208(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 224(%rsi), %xmm0 @@ -10702,7 +10702,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 240(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 256(%r9), %xmm1 @@ -10722,7 +10722,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 256(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10742,7 +10742,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 272(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 288(%rsi), %xmm0 @@ -10784,7 +10784,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 304(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 320(%r9), %xmm1 @@ -10804,7 +10804,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 320(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10824,7 +10824,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 336(%rcx), %xmm1 ; AVX1-ONLY-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 352(%rsi), %xmm0 @@ -10863,7 +10863,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm14[1],ymm0[1],ymm14[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 368(%rcx), %xmm12 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm12[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm12[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 384(%r9), 
%xmm13 @@ -10881,7 +10881,7 @@ ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] ; AVX1-ONLY-NEXT: vmovaps 384(%rsi), %xmm3 ; AVX1-ONLY-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm3 = xmm3[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] @@ -10898,7 +10898,7 @@ ; AVX1-ONLY-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm8[1],ymm0[1],ymm8[3],ymm0[3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],mem[6,7] ; AVX1-ONLY-NEXT: vmovaps 400(%rcx), %xmm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm1 = xmm7[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,3,2,3] ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 416(%rsi), %xmm3 @@ -10925,7 +10925,7 @@ ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovaps 448(%rsi), %xmm0 ; AVX1-ONLY-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-ONLY-NEXT: vbroadcastsd 456(%rcx), %ymm1 ; AVX1-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovapd 448(%rdx), %ymm5 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -1181,8 +1181,8 @@ ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm8 = xmm9[0,0,1,1] -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[1,1,2,2] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm8 = xmm9[0,0,1,1] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} xmm10 = xmm9[1,1,2,2] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm13[1,0,2,2,4,5,6,7] ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -1598,7 +1598,7 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,2,1,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm0[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm0 = [281474976710655,281474976710655,281474976710655,281474976710655] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm0, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm7 @@ -1607,7 +1607,7 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm3, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa (%rsi), %xmm2 @@ -1639,9 +1639,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm12, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%r10), %xmm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm5, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,4,4,6,5] @@ -1653,7 +1653,7 @@ ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 ; AVX1-ONLY-NEXT: vmovdqa 16(%rax), %xmm9 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm12 = ymm12[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm12, %ymm2, %ymm12 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm12, %ymm3 @@ -1674,9 +1674,9 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vmovdqa 16(%r8), %xmm7 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] @@ -1684,7 +1684,7 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm2, %ymm4 ; AVX1-ONLY-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] @@ -1707,9 +1707,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm5 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm7 @@ -1726,7 +1726,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm15 = 
xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm15, %ymm13, %ymm13 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm13, %ymm13 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm13, %ymm1 @@ -1741,9 +1741,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm9, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm0, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm7[8],xmm3[9],xmm7[9],xmm3[10],xmm7[10],xmm3[11],xmm7[11],xmm3[12],xmm7[12],xmm3[13],xmm7[13],xmm3[14],xmm7[14],xmm3[15],xmm7[15] @@ -1756,7 +1756,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm2, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm2, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm6, %ymm3 @@ -1767,9 +1767,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -1791,9 +1791,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm0, %ymm4 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm5, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm5, %ymm4 ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm5 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero @@ -1817,9 +1817,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm0, %ymm5 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = 
ymm6[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm0, %ymm6, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload @@ -4045,7 +4045,7 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,4,6,5] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm5 = [281474976710655,281474976710655,281474976710655,281474976710655] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%r9), %xmm3 @@ -4056,7 +4056,7 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovdqa (%rcx), %xmm4 @@ -4077,7 +4077,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm4[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vbroadcastsd {{.*#+}} ymm4 = [18446744069414649855,18446744069414649855,18446744069414649855,18446744069414649855] ; AVX1-ONLY-NEXT: vandnps %ymm8, %ymm4, %ymm8 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm7, %ymm7 @@ -4092,9 +4092,9 @@ ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 ; AVX1-ONLY-NEXT: vmovdqa 48(%r10), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero @@ -4123,9 +4123,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm6, %ymm5, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm6, %ymm7, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm8 @@ -4154,9 +4154,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] 
+; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] @@ -4167,7 +4167,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 @@ -4183,9 +4183,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm0[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] @@ -4210,9 +4210,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,4,4,6,5] @@ -4223,7 +4223,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 @@ -4243,9 +4243,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm12[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm3, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm7 @@ -4274,9 +4274,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm12[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm11, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpermilps 
{{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm15[0,1,2,3,4,4,6,5] @@ -4287,7 +4287,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm10 = xmm10[0],zero,zero,zero,xmm10[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm11, %ymm10 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm10, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm10, %ymm2 @@ -4303,9 +4303,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm2[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm10 = xmm2[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm8, %ymm8 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm8, %ymm8 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] @@ -4330,9 +4330,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm2[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,4,4,6,5] @@ -4343,7 +4343,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 @@ -4363,9 +4363,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm1[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm6, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm6 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm6, %ymm13 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm11 @@ -4393,9 +4393,9 @@ ; 
AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,6,5] @@ -4406,7 +4406,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm4, %ymm0 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 @@ -4421,9 +4421,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm2, %ymm2 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm5, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm2, %ymm2 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] @@ -4447,9 +4447,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm3 = xmm7[0,1,2,3,4,4,6,5] @@ -4460,7 +4460,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm4, %ymm3 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm1, %ymm1 @@ -4477,9 +4477,9 @@ ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,1,3,3,4,5,6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm8, %ymm7, %ymm7 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm5, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] +; AVX1-ONLY-NEXT: 
vshufps {{.*#+}} ymm7 = ymm7[0,0,2,1,4,4,6,5] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -4507,9 +4507,9 @@ ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm8 = xmm6[0,1,2,3,4,5,5,7] ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,7,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm8, %ymm6 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm6, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm9[0,1,2,3,4,4,6,5] @@ -4520,7 +4520,7 @@ ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3] ; AVX1-ONLY-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm6, %ymm3 -; AVX1-ONLY-NEXT: vpermilps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] +; AVX1-ONLY-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,1,3,3,6,5,7,7] ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm4, %ymm5 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm3, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -446,7 +446,7 @@ ; AVX1-SLOW-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-SLOW-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-cmp.ll @@ -102,7 +102,7 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax @@ -165,7 +165,7 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax @@ -340,9 +340,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax @@ -411,9 +411,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; 
AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: testl %eax, %eax @@ -638,9 +638,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -718,9 +718,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1002,9 +1002,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -1091,9 +1091,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and.ll b/llvm/test/CodeGen/X86/vector-reduce-and.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and.ll @@ -86,7 +86,7 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -140,7 +140,7 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -287,9 +287,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps 
{{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -349,9 +349,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -554,9 +554,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -629,9 +629,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -891,9 +891,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 @@ -975,9 +975,9 @@ ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll @@ -76,7 +76,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; @@ -87,7 +87,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps 
{{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float %a0, <4 x float> %a1) @@ -144,7 +144,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -152,7 +152,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -164,7 +164,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -172,7 +172,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -264,7 +264,7 @@ ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -272,14 +272,14 @@ ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -287,7 +287,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -299,7 +299,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -307,7 +307,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] 
; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 @@ -315,7 +315,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -323,7 +323,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -405,7 +405,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: retq ; @@ -414,7 +414,7 @@ ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: retq ; @@ -424,7 +424,7 @@ ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: retq ; @@ -434,7 +434,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a0) @@ -488,7 +488,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -496,7 +496,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; AVX1-SLOW-NEXT: retq @@ -506,7 +506,7 @@ ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm1 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss 
%xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -514,7 +514,7 @@ ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -525,7 +525,7 @@ ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -533,7 +533,7 @@ ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -544,7 +544,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -552,7 +552,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -641,7 +641,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm2 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm2 @@ -649,14 +649,14 @@ ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -664,7 +664,7 @@ ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-SLOW-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-SLOW-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-SLOW-NEXT: vzeroupper ; 
AVX1-SLOW-NEXT: retq @@ -674,7 +674,7 @@ ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm2 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm2 @@ -682,14 +682,14 @@ ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX1-FAST-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -697,7 +697,7 @@ ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX1-FAST-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX1-FAST-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX1-FAST-NEXT: vzeroupper ; AVX1-FAST-NEXT: retq @@ -708,7 +708,7 @@ ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm2 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm2 @@ -716,14 +716,14 @@ ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX2-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -731,7 +731,7 @@ ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX2-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -742,7 +742,7 @@ ; AVX512-NEXT: vaddss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -750,7 +750,7 @@ ; AVX512-NEXT: vaddss %xmm3, 
%xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -758,7 +758,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -766,7 +766,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -836,7 +836,7 @@ ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -846,7 +846,7 @@ ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %a0) @@ -900,7 +900,7 @@ ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -908,7 +908,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -919,7 +919,7 @@ ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -927,7 +927,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1016,7 +1016,7 @@ ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps 
{{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm2 @@ -1024,14 +1024,14 @@ ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vaddss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 @@ -1039,7 +1039,7 @@ ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -1050,7 +1050,7 @@ ; AVX512-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -1058,7 +1058,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 @@ -1066,7 +1066,7 @@ ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vaddss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm1 @@ -1074,7 +1074,7 @@ ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vaddss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmax.ll @@ -197,7 +197,7 @@ ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm0, %xmm3, %xmm4 @@ -213,7 +213,7 @@ ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512-NEXT: vpermilpd 
{{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX512-NEXT: vmaxss %xmm0, %xmm3, %xmm4 @@ -302,10 +302,10 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX-NEXT: vmaxss %xmm0, %xmm7, %xmm8 @@ -335,10 +335,10 @@ ; AVX512BW-LABEL: test_v8f32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512BW-NEXT: vmaxss %xmm0, %xmm7, %xmm8 @@ -367,11 +367,11 @@ ; ; AVX512VL-LABEL: test_v8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm8 @@ -384,8 +384,8 @@ ; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 ; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm4, %xmm0 ; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} @@ -393,8 +393,8 @@ ; AVX512VL-NEXT: vmaxss %xmm0, %xmm3, %xmm0 ; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> %a0) @@ -506,7 +506,7 @@ ; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 @@ -522,7 +522,7 @@ ; AVX-NEXT: vmaxss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, 
%xmm1, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmaxss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper @@ -539,7 +539,7 @@ ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} @@ -555,7 +555,7 @@ ; AVX512BW-NEXT: vmaxss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} @@ -571,7 +571,7 @@ ; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} ; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vmaxss %xmm3, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} @@ -586,7 +586,7 @@ ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vmaxss %xmm2, %xmm1, %xmm0 ; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} @@ -596,18 +596,18 @@ ; AVX512VL-LABEL: test_v16f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm6[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm9 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm10[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm10[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm10[1,1,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vmaxss %xmm0, %xmm15, %xmm16 @@ -620,8 +620,8 @@ ; AVX512VL-NEXT: vmaxss %xmm0, %xmm13, %xmm0 ; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm9, %xmm0 -; 
AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxss %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm12, %xmm0 ; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} @@ -629,18 +629,18 @@ ; AVX512VL-NEXT: vmaxss %xmm0, %xmm11, %xmm0 ; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm10, %xmm0 -; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm0 +; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm6, %xmm0 ; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vmaxss %xmm0, %xmm9, %xmm0 +; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm8, %xmm0 ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vmaxss %xmm0, %xmm7, %xmm0 -; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vmaxss %xmm0, %xmm5, %xmm0 ; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmin.ll @@ -130,7 +130,7 @@ ; ; AVX-LABEL: test_v4f32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm0, %xmm3, %xmm4 @@ -146,7 +146,7 @@ ; ; AVX512-LABEL: test_v4f32: ; AVX512: # %bb.0: -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX512-NEXT: vminss %xmm0, %xmm3, %xmm4 @@ -235,10 +235,10 @@ ; AVX-LABEL: test_v8f32: ; AVX: # %bb.0: ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX-NEXT: vminss %xmm0, %xmm7, %xmm8 @@ -268,10 +268,10 @@ ; AVX512BW-LABEL: test_v8f32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512BW-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512BW-NEXT: vminss %xmm0, %xmm7, %xmm8 @@ -300,11 +300,11 @@ ; ; AVX512VL-LABEL: test_v8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: 
vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm2[1,1,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm6 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm7 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm8 @@ -317,8 +317,8 @@ ; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0 ; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm4, %xmm0 ; AVX512VL-NEXT: vmovss %xmm4, %xmm0, %xmm0 {%k1} @@ -326,8 +326,8 @@ ; AVX512VL-NEXT: vminss %xmm0, %xmm3, %xmm0 ; AVX512VL-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vminss %xmm0, %xmm1, %xmm0 +; AVX512VL-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq %1 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> %a0) @@ -439,7 +439,7 @@ ; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 @@ -455,7 +455,7 @@ ; AVX-NEXT: vminss %xmm1, %xmm3, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vminss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper @@ -472,7 +472,7 @@ ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} @@ -488,7 +488,7 @@ ; AVX512BW-NEXT: vminss %xmm3, %xmm2, %xmm3 ; AVX512BW-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k1} ; AVX512BW-NEXT: vextractf32x4 $2, %zmm0, %xmm2 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} @@ -504,7 +504,7 @@ ; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm3 ; AVX512BW-NEXT: vmovss %xmm1, %xmm3, %xmm3 {%k1} ; AVX512BW-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm3, %xmm3, %k1 ; AVX512BW-NEXT: vminss %xmm3, %xmm1, %xmm2 ; 
AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} @@ -519,7 +519,7 @@ ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm2 ; AVX512BW-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,3,3,3] +; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[3,3,3,3] ; AVX512BW-NEXT: vcmpunordss %xmm2, %xmm2, %k1 ; AVX512BW-NEXT: vminss %xmm2, %xmm1, %xmm0 ; AVX512BW-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} @@ -529,18 +529,18 @@ ; AVX512VL-LABEL: test_v16f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextractf32x4 $3, %zmm0, %xmm3 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm3[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm1 = xmm3[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm3[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] ; AVX512VL-NEXT: vextractf32x4 $2, %zmm0, %xmm6 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm5 = xmm6[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm7 = xmm6[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm8 = xmm6[1,1,3,3] -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm9 -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm10 = xmm9[3,3,3,3] -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm9[1,0] -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm9[1,1,3,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm13 = xmm0[3,3,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm5 = xmm6[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm8 = xmm6[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm9 = xmm6[1,1,3,3] +; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm10 +; AVX512VL-NEXT: vshufps {{.*#+}} xmm7 = xmm10[3,3,3,3] +; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm11 = xmm10[1,0] +; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm12 = xmm10[1,1,3,3] +; AVX512VL-NEXT: vshufps {{.*#+}} xmm13 = xmm0[3,3,3,3] ; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm14 = xmm0[1,0] ; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm15 = xmm0[1,1,3,3] ; AVX512VL-NEXT: vminss %xmm0, %xmm15, %xmm16 @@ -553,8 +553,8 @@ ; AVX512VL-NEXT: vminss %xmm0, %xmm13, %xmm0 ; AVX512VL-NEXT: vmovss %xmm13, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm9, %xmm0 -; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vminss %xmm0, %xmm10, %xmm0 +; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm12, %xmm0 ; AVX512VL-NEXT: vmovss %xmm12, %xmm0, %xmm0 {%k1} @@ -562,18 +562,18 @@ ; AVX512VL-NEXT: vminss %xmm0, %xmm11, %xmm0 ; AVX512VL-NEXT: vmovss %xmm11, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm10, %xmm0 -; AVX512VL-NEXT: vmovss %xmm10, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm0 +; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm6, %xmm0 ; AVX512VL-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 +; AVX512VL-NEXT: vminss %xmm0, %xmm9, %xmm0 +; AVX512VL-NEXT: vmovss %xmm9, %xmm0, %xmm0 {%k1} +; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm8, %xmm0 ; AVX512VL-NEXT: vmovss %xmm8, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 -; AVX512VL-NEXT: vminss %xmm0, %xmm7, %xmm0 -; AVX512VL-NEXT: vmovss %xmm7, %xmm0, %xmm0 {%k1} -; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vminss %xmm0, %xmm5, %xmm0 ; AVX512VL-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} ; AVX512VL-NEXT: vcmpunordss %xmm0, %xmm0, %k1 diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll @@ -75,7 +75,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; @@ -86,7 +86,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float %a0, <4 x float> %a1) @@ -143,7 +143,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -151,7 +151,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -163,7 +163,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -171,7 +171,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -263,7 +263,7 @@ ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -271,14 +271,14 @@ ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm1 = xmm2[1,0] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm2[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm2[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -286,7 +286,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = 
xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -298,7 +298,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -306,7 +306,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm1, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 @@ -314,7 +314,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vextractf32x4 $3, %zmm1, %xmm1 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -322,7 +322,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -393,7 +393,7 @@ ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -403,7 +403,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a0) @@ -457,7 +457,7 @@ ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -465,7 +465,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -476,7 +476,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: 
vmulss %xmm0, %xmm1, %xmm1 @@ -484,7 +484,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -573,7 +573,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2 @@ -581,14 +581,14 @@ ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -596,7 +596,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -607,7 +607,7 @@ ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -615,7 +615,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -623,7 +623,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -631,7 +631,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -701,7 +701,7 @@ ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; @@ -711,7 +711,7 @@ ; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: retq %1 = call float @llvm.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %a0) @@ -765,7 +765,7 @@ ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -773,7 +773,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -784,7 +784,7 @@ ; AVX512-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -792,7 +792,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -881,7 +881,7 @@ ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm2 @@ -889,14 +889,14 @@ ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] ; AVX-NEXT: vmulss %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX-NEXT: vmulss %xmm0, %xmm2, %xmm0 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 @@ -904,7 +904,7 @@ ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0] ; AVX-NEXT: vmulss %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq @@ -915,7 +915,7 @@ ; AVX512-NEXT: 
vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -923,7 +923,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 @@ -931,7 +931,7 @@ ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm3 = xmm2[1,0] ; AVX512-NEXT: vmulss %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm1 @@ -939,7 +939,7 @@ ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,3] +; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-or.ll b/llvm/test/CodeGen/X86/vector-reduce-or.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or.ll @@ -86,7 +86,7 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -140,7 +140,7 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -287,9 +287,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -349,9 +349,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -554,9 +554,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -629,9 +629,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -891,9 +891,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 @@ -975,9 +975,9 @@ ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -170,7 +170,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -182,7 +182,7 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -360,7 +360,7 @@ ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -374,7 +374,7 @@ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, 
%xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -681,7 +681,7 @@ ; AVX1-NEXT: vblendvpd %xmm1, %xmm5, %xmm4, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -699,7 +699,7 @@ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -170,7 +170,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -182,7 +182,7 @@ ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -360,7 +360,7 @@ ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -374,7 +374,7 @@ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax @@ -681,7 +681,7 @@ ; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm5, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax @@ -699,7 +699,7 @@ ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -185,7 
+185,7 @@ ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -202,7 +202,7 @@ ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -400,7 +400,7 @@ ; AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -421,7 +421,7 @@ ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -769,7 +769,7 @@ ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -798,7 +798,7 @@ ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -186,7 +186,7 @@ ; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 ; AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 @@ -203,7 +203,7 @@ ; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -403,7 +403,7 @@ ; AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 ; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm3, 
%xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -424,7 +424,7 @@ ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 ; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 @@ -773,7 +773,7 @@ ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 @@ -802,7 +802,7 @@ ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 ; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 ; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor.ll b/llvm/test/CodeGen/X86/vector-reduce-xor.ll --- a/llvm/test/CodeGen/X86/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-xor.ll @@ -86,7 +86,7 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -140,7 +140,7 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: vzeroupper @@ -287,9 +287,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -349,9 +349,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax ; AVX1-NEXT: vzeroupper @@ -554,9 +554,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor 
%xmm1, %xmm0, %xmm0 @@ -629,9 +629,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -891,9 +891,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 @@ -975,9 +975,9 @@ ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -56,7 +56,7 @@ ; AVX512F-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512F-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512F-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512F-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} ; AVX512F-SLOW-NEXT: vmovdqa %ymm0, (%rdx) @@ -81,7 +81,7 @@ ; AVX512DQ-SLOW-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-SLOW-NEXT: vpmovm2d %k0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512DQ-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512DQ-SLOW-NEXT: vpmovd2m %ymm0, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} ; AVX512DQ-SLOW-NEXT: vmovdqa %ymm0, (%rdx) @@ -106,7 +106,7 @@ ; AVX512BW-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512BW-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512BW-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 ; AVX512BW-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} ; AVX512BW-SLOW-NEXT: vmovdqa %ymm0, (%rdx) @@ -132,7 +132,7 @@ ; AVX512VBMI-SLOW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VBMI-SLOW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VBMI-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VBMI-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VBMI-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512VBMI-SLOW-NEXT: vptestmd %ymm0, %ymm0, %k1 ; 
AVX512VBMI-SLOW-NEXT: vmovdqa32 (%rsi), %ymm0 {%k1} {z} ; AVX512VBMI-SLOW-NEXT: vmovdqa %ymm0, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -2681,7 +2681,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9241421688590303745,9241421688590303745] diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -1526,8 +1526,8 @@ ; XOPAVX1-LABEL: vector_variable_shift_right: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; XOPAVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[0,0,0,0] +; XOPAVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0] ; XOPAVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0 ; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1712,7 +1712,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shift32_v4i64: @@ -1724,7 +1724,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shift32_v4i64: @@ -1746,7 +1746,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: shift32_v4i64: diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -1600,7 +1600,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shift32_v4i64: @@ -1612,7 +1612,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[0,2,1,3,4,6,5,7] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shift32_v4i64: @@ -1634,7 +1634,7 @@ ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: shift32_v4i64: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll @@ -15,7 +15,7 @@ ; ; AVX1-LABEL: shuffle_v2i64_00: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v2i64_00: @@ -38,7 +38,7 @@ ; ; AVX-LABEL: shuffle_v2i64_10: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -51,7 +51,7 @@ ; ; AVX-LABEL: shuffle_v2i64_11: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -64,7 +64,7 @@ ; ; AVX1-LABEL: shuffle_v2i64_22: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v2i64_22: @@ -87,7 +87,7 @@ ; ; AVX-LABEL: shuffle_v2i64_32: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,0,1] ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle @@ -100,7 +100,7 @@ ; ; AVX-LABEL: shuffle_v2i64_33: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,3,2,3] ; AVX-NEXT: retq %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> ret <2 x i64> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -18,7 +18,7 @@ ; ; AVX-LABEL: shuffle_v4i32_0001: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -31,7 +31,7 @@ ; ; AVX-LABEL: shuffle_v4i32_0020: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -44,7 +44,7 @@ ; ; AVX-LABEL: shuffle_v4i32_0112: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -57,7 +57,7 @@ ; ; AVX-LABEL: shuffle_v4i32_0300: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -70,7 +70,7 @@ ; ; AVX-LABEL: 
shuffle_v4i32_1000: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -83,7 +83,7 @@ ; ; AVX-LABEL: shuffle_v4i32_2200: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -96,7 +96,7 @@ ; ; AVX-LABEL: shuffle_v4i32_3330: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -109,7 +109,7 @@ ; ; AVX-LABEL: shuffle_v4i32_3210: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -123,7 +123,7 @@ ; ; AVX-LABEL: shuffle_v4i32_2121: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,2,1] ; AVX-NEXT: retq %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %shuffle @@ -137,7 +137,7 @@ ; ; AVX-LABEL: shuffle_v4f32_0001: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -150,7 +150,7 @@ ; ; AVX-LABEL: shuffle_v4f32_0020: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,2,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -163,7 +163,7 @@ ; ; AVX-LABEL: shuffle_v4f32_0300: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -176,7 +176,7 @@ ; ; AVX-LABEL: shuffle_v4f32_1000: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -189,7 +189,7 @@ ; ; AVX-LABEL: shuffle_v4f32_2200: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,0,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -202,7 +202,7 @@ ; ; AVX-LABEL: shuffle_v4f32_3330: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,3,3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -215,7 +215,7 @@ ; ; AVX-LABEL: shuffle_v4f32_3210: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -228,7 +228,7 @@ ; ; AVX-LABEL: shuffle_v4f32_0011: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: retq %shuffle = 
shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -241,7 +241,7 @@ ; ; AVX-LABEL: shuffle_v4f32_2233: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX-NEXT: retq %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %shuffle @@ -358,7 +358,7 @@ ; ; AVX1-LABEL: shuffle_v4i32_0124: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3] ; AVX1-NEXT: retq ; @@ -404,15 +404,15 @@ ; ; AVX1-LABEL: shuffle_v4i32_0142: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i32_0142: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,2,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3] ; AVX2-NEXT: retq ; @@ -455,15 +455,15 @@ ; ; AVX1-LABEL: shuffle_v4i32_0412: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i32_0412: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,2] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: retq ; @@ -505,7 +505,7 @@ ; ; AVX1OR2-LABEL: shuffle_v4i32_4012: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,2] +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,2] ; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; AVX1OR2-NEXT: retq ; @@ -540,7 +540,7 @@ ; AVX1OR2-LABEL: shuffle_v4i32_0451: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,3,2] +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,3,2] ; AVX1OR2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_0451: @@ -575,7 +575,7 @@ ; AVX1OR2-LABEL: shuffle_v4i32_4015: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3] +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] ; AVX1OR2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_4015: @@ -1203,14 +1203,14 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v4i32_z4zz: @@ -1259,14 +1259,14 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: 
vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,0,1] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v4i32_zz4z: @@ -1327,14 +1327,14 @@ ; ; AVX1-LABEL: shuffle_v4i32_z6zz: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX2-SLOW-NEXT: retq @@ -2409,7 +2409,7 @@ ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vmovaps %xmm1, (%rsi) ; AVX1-NEXT: retq ; @@ -2517,7 +2517,7 @@ ; AVX1OR2-LABEL: shuffle_mem_v4f32_0624: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],mem[0,2] -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1] ; AVX1OR2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_mem_v4f32_0624: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -20,7 +20,7 @@ ; ; AVX-LABEL: shuffle_v8i16_01012323: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -33,7 +33,7 @@ ; ; AVX-LABEL: shuffle_v8i16_67452301: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -240,7 +240,7 @@ ; ; AVX-LABEL: shuffle_v8i16_23016745: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2] ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle @@ -1382,7 +1382,7 @@ ; ; AVX-LABEL: shuffle_v8i16_XXXdXXXX: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,2,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,2,3,3] ; AVX-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -691,7 +691,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: 
vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -722,7 +722,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -741,7 +741,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -772,7 +772,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -1542,7 +1542,7 @@ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: @@ -1564,7 +1564,7 @@ ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: @@ -1728,7 +1728,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: @@ -1759,7 +1759,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_01_00_08_08_08_08_08_08_09_08: @@ -1778,7 +1778,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = 
ymm0[0,0,1,0,4,4,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: @@ -1809,7 +1809,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_02_00_00_08_08_08_08_08_10_08_08: @@ -1828,7 +1828,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: @@ -1859,7 +1859,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,0,4,4,5,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_03_00_00_00_08_08_08_08_11_08_08_08: @@ -2990,7 +2990,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: @@ -3002,7 +3002,7 @@ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_02_03_zz_zz_06_07_zz_zz_10_11_zz_zz_14_15_zz_zz: @@ -3363,7 +3363,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: @@ -3384,7 +3384,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] ; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_06_07_04_05_02_03_00_09_14_15_12_13_10_11_08_09: @@ -3638,7 +3638,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,0,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_03_01_02_00_06_07_04_13_11_09_10_08_14_15_12_13: @@ -3747,7 +3747,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] 
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: @@ -3768,7 +3768,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3,4],xmm1[5],xmm0[6,7] ; XOPAVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_02_03_00_01_06_07_04_13_10_11_08_09_14_15_12_13: @@ -3789,7 +3789,7 @@ ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,3,0,2,4,5,6,7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,3,0,2,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,3,2,4,5,7,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_02_03_00_02_06_07_04_13_10_11_08_10_14_15_12_13: @@ -5788,7 +5788,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu(<16 x i16> %a, <16 x i16> %b) { ; ALL-LABEL: shuffle_v16i16_uu_uu_uu_21_uu_uu_uu_uu_uu_uu_uu_29_uu_uu_uu_uu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,2,2,2,6,6,6,6] ; ALL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -6979,7 +6979,7 @@ ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: @@ -7001,7 +7001,7 @@ ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; XOPAVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: @@ -7171,7 +7171,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: @@ -7202,7 +7202,7 @@ ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,7,5] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_04_06_07_uu_uu_06_07_05_12_14_15_uu_uu_14_15_13: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll --- 
a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -2436,7 +2436,7 @@ ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -838,7 +838,7 @@ define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_0000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -858,7 +858,7 @@ define <4 x i64> @shuffle_v4i64_0001(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_0001: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1288,13 +1288,13 @@ ; AVX2-LABEL: shuffle_v4i64_1054: ; AVX2: # %bb.0: ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1054: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1054: @@ -1306,7 +1306,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_1054: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -1356,7 +1356,7 @@ ; AVX2-LABEL: shuffle_v4i64_3276: ; AVX2: # %bb.0: ; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_3276: @@ -1390,13 +1390,13 @@ ; AVX2-LABEL: shuffle_v4i64_1076: ; AVX2: # %bb.0: ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1076: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v4i64_1076: @@ -1551,7 +1551,7 @@ 
define <4 x i64> @shuffle_v4i64_11uu(<4 x i64> %a, <4 x i64> %b) { ; ALL-LABEL: shuffle_v4i64_11uu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; ALL-NEXT: retq %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> ret <4 x i64> %shuffle @@ -1561,7 +1561,7 @@ ; AVX1-LABEL: shuffle_v4i64_22uu: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v4i64_22uu: @@ -1629,14 +1629,14 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i64_0044_v2i64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX512VL-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> @@ -1649,14 +1649,14 @@ ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1032_v2i64: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1032_v2i64: @@ -1671,7 +1671,7 @@ ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> @@ -1833,7 +1833,7 @@ define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) { ; AVX1-LABEL: broadcast_v4f64_0000_from_v2i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -10,7 +10,7 @@ define <8 x float> @shuffle_v8f32_00000000(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00000000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -25,14 +25,14 @@ define <8 x float> @shuffle_v8f32_00000010(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: 
shuffle_v8f32_00000010: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00000010: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -44,13 +44,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00000010: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000010: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -62,7 +62,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00000010: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -72,14 +72,14 @@ define <8 x float> @shuffle_v8f32_00000200(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00000200: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00000200: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; @@ -91,13 +91,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00000200: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00000200: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; @@ -109,7 +109,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00000200: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -119,14 +119,14 @@ define <8 x float> @shuffle_v8f32_00003000(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00003000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = 
xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00003000: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; @@ -138,13 +138,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00003000: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00003000: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; @@ -156,7 +156,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00003000: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -244,8 +244,8 @@ define <8 x float> @shuffle_v8f32_00112233(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00112233: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -261,14 +261,14 @@ define <8 x float> @shuffle_v8f32_00001111(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_00001111: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_00001111: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: retq ; @@ -280,13 +280,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_00001111: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_00001111: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -298,7 +298,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_00001111: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; 
AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -318,7 +318,7 @@ ; AVX1-LABEL: shuffle_v8f32_08080808: ; AVX1: # %bb.0: ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -353,13 +353,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_08084c4c: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_08084c4c: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_08084c4c: @@ -439,7 +439,7 @@ ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; @@ -447,7 +447,7 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq @@ -465,7 +465,7 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -483,8 +483,8 @@ define <8 x float> @shuffle_v8f32_091b2d3f(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_091b2d3f: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1,1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: retq @@ -526,7 +526,7 @@ ; ; AVX2-SLOW-LABEL: shuffle_v8f32_09ab1def: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq @@ -540,7 +540,7 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_09ab1def: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; 
AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -558,7 +558,7 @@ define <8 x float> @shuffle_v8f32_00014445(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_00014445: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -567,7 +567,7 @@ define <8 x float> @shuffle_v8f32_00204464(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_00204464: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -576,7 +576,7 @@ define <8 x float> @shuffle_v8f32_03004744(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_03004744: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -585,7 +585,7 @@ define <8 x float> @shuffle_v8f32_10005444(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_10005444: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -594,7 +594,7 @@ define <8 x float> @shuffle_v8f32_22006644(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_22006644: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -603,7 +603,7 @@ define <8 x float> @shuffle_v8f32_33307774(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_33307774: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -612,7 +612,7 @@ define <8 x float> @shuffle_v8f32_32107654(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_32107654: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -621,7 +621,7 @@ define <8 x float> @shuffle_v8f32_00234467(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_00234467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -676,14 +676,14 @@ ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: 
shuffle_v8f32_00004444_v4f32: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_00004444_v4f32: @@ -698,7 +698,7 @@ ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> ret <8 x float> %1 @@ -707,7 +707,7 @@ define <8 x float> @shuffle_v8f32_10325476(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_10325476: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -760,7 +760,7 @@ define <8 x float> @shuffle_v8f32_10235467(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_10235467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -769,7 +769,7 @@ define <8 x float> @shuffle_v8f32_10225466(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_10225466: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -962,7 +962,7 @@ ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-SLOW-NEXT: retq @@ -982,7 +982,7 @@ ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm2 = [4,5,2,0,4,5,2,0] ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,2,0,4,7,6,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -1001,18 +1001,18 @@ ; AVX1-LABEL: shuffle_v8f32_f511235a: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,1,1,5,5,5,5] ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[3],ymm0[3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,1,2,2,7,5,6,6] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_f511235a: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: retq @@ -1028,9 +1028,9 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_f511235a: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[3,2,2,3,7,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,0] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,2,3,5,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,1,2] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -1047,13 +1047,13 @@ define <8 x float> @shuffle_v8f32_32103210(<8 x float> %a, <8 x float> %b) { ; AVX1-LABEL: shuffle_v8f32_32103210: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_32103210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -1066,13 +1066,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_32103210: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_32103210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -1085,7 +1085,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_32103210: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1096,12 +1096,12 @@ ; AVX1-LABEL: shuffle_v8f32_76547654: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_76547654: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: retq ; @@ -1114,13 +1114,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_76547654: ; AVX2-FAST-PERLANE: # %bb.0: -; 
AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_76547654: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-SLOW-NEXT: retq ; @@ -1133,7 +1133,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_76547654: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1144,12 +1144,12 @@ ; AVX1-LABEL: shuffle_v8f32_76543210: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_76543210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -1161,13 +1161,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_76543210: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_76543210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -1179,7 +1179,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_76543210: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1190,13 +1190,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_3210ba98: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210ba98: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_3210ba98: @@ -1208,7 +1208,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_3210ba98: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: 
vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1218,13 +1218,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_3210fedc: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_3210fedc: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_3210fedc: @@ -1240,13 +1240,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_7654fedc: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_7654fedc: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_7654fedc: @@ -1258,7 +1258,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_7654fedc: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1268,13 +1268,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_fedc7654: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_fedc7654: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_fedc7654: @@ -1287,7 +1287,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_fedc7654: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1332,13 +1332,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_ba987654: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba987654: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = 
ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8f32_ba987654: @@ -1355,13 +1355,13 @@ ; AVX1OR2-LABEL: shuffle_v8f32_ba983210: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_ba983210: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_ba983210: @@ -1374,7 +1374,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8f32_ba983210: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1402,12 +1402,12 @@ ; AVX1-LABEL: shuffle_v8f32_084c195d: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] -; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,1,4,4,6,5] ; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5],ymm1[6,7] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm2 = ymm2[1,1,0,3,5,5,4,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,1,0,3,5,5,4,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,1,3,4,5,5,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX1-NEXT: retq @@ -1540,7 +1540,7 @@ define <8 x float> @shuffle_v8f32_uuuu1111(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_uuuu1111: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1551,7 +1551,7 @@ ; AVX1-LABEL: shuffle_v8f32_44444444: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8f32_44444444: @@ -1605,7 +1605,7 @@ define <8 x float> @shuffle_v8f32_uuuu3210(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_uuuu3210: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1625,7 +1625,7 @@ define <8 x float> @shuffle_v8f32_1111uuuu(<8 x float> %a, <8 x float> %b) { ; ALL-LABEL: shuffle_v8f32_1111uuuu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1635,7 +1635,7 @@ ; ALL-LABEL: shuffle_v8f32_5555uuuu: ; ALL: # %bb.0: ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> ret <8 x float> %shuffle @@ -1646,14 +1646,14 @@ ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8f32_32107654_v4f32: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8f32_32107654_v4f32: @@ -1668,7 +1668,7 @@ ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> %2 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> @@ -1702,7 +1702,7 @@ define <8 x i32> @shuffle_v8i32_00000000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00000000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1717,14 +1717,14 @@ define <8 x i32> @shuffle_v8i32_00000010(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00000010: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00000010: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -1736,13 +1736,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00000010: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000010: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -1754,7 +1754,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00000010: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps 
{{.*#+}} xmm0 = xmm0[0,0,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1764,14 +1764,14 @@ define <8 x i32> @shuffle_v8i32_00000200(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00000200: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00000200: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; @@ -1783,13 +1783,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00000200: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00000200: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; @@ -1801,7 +1801,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00000200: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,2] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,2] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1811,14 +1811,14 @@ define <8 x i32> @shuffle_v8i32_00003000(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00003000: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00003000: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-SLOW-NEXT: retq ; @@ -1830,13 +1830,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00003000: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00003000: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,3,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-SLOW-NEXT: retq ; @@ -1848,7 +1848,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00003000: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,3,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[0,0,3,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,0] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1932,7 +1932,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_01014545: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1941,15 +1941,15 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00112233: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00112233: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_00112233: @@ -1961,13 +1961,13 @@ ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00112233: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00112233: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00112233: @@ -1979,7 +1979,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00112233: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -1988,14 +1988,14 @@ define <8 x i32> @shuffle_v8i32_00001111(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_00001111: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_00001111: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: retq ; @@ -2007,13 +2007,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_00001111: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00001111: ; AVX512VL-SLOW: # %bb.0: -; 
AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -2025,7 +2025,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_00001111: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2045,7 +2045,7 @@ ; AVX1-LABEL: shuffle_v8i32_08080808: ; AVX1: # %bb.0: ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -2080,19 +2080,19 @@ ; AVX1-LABEL: shuffle_v8i32_08084c4c: ; AVX1: # %bb.0: ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8i32_08084c4c: ; AVX2: # %bb.0: ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_08084c4c: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_08084c4c: @@ -2172,7 +2172,7 @@ ; AVX1-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0] ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[1,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,2,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,2,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; @@ -2180,9 +2180,9 @@ ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_08991abb: @@ -2198,9 +2198,9 @@ ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,3] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] +; AVX2-FAST-PERLANE-NEXT: vblendps 
{{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v8i32_08991abb: @@ -2252,7 +2252,7 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_09ab1def: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[1,1,3,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[1,1,3,3] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX1-NEXT: retq @@ -2291,7 +2291,7 @@ define <8 x i32> @shuffle_v8i32_00014445(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_00014445: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,1,4,4,4,5] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2300,7 +2300,7 @@ define <8 x i32> @shuffle_v8i32_00204464(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_00204464: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,0,4,4,6,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2309,7 +2309,7 @@ define <8 x i32> @shuffle_v8i32_03004744(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_03004744: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3,0,0,4,7,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2318,7 +2318,7 @@ define <8 x i32> @shuffle_v8i32_10005444(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_10005444: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,0,0,5,4,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2327,7 +2327,7 @@ define <8 x i32> @shuffle_v8i32_22006644(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_22006644: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,0,0,6,6,4,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2336,7 +2336,7 @@ define <8 x i32> @shuffle_v8i32_33307774(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_33307774: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,3,0,7,7,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2345,7 +2345,7 @@ define <8 x i32> @shuffle_v8i32_32107654(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_32107654: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2354,7 +2354,7 @@ define <8 x i32> @shuffle_v8i32_00234467(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_00234467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,3,4,4,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2368,7 +2368,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00224466: ; 
AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] +; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2377,7 +2377,7 @@ define <8 x i32> @shuffle_v8i32_10325476(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_10325476: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2391,7 +2391,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_11335577: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2400,7 +2400,7 @@ define <8 x i32> @shuffle_v8i32_10235467(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_10235467: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,3,5,4,6,7] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2409,7 +2409,7 @@ define <8 x i32> @shuffle_v8i32_10225466(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_10225466: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0,2,2,5,4,6,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2713,7 +2713,7 @@ ; AVX2-SLOW-LABEL: shuffle_v8i32_6caa87e5: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: retq @@ -2729,7 +2729,7 @@ ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_6caa87e5: ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0,2,2,4,4,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: retq @@ -2747,13 +2747,13 @@ define <8 x i32> @shuffle_v8i32_32103210(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_32103210: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_32103210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -2766,13 +2766,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_32103210: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = 
ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_32103210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -2785,7 +2785,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_32103210: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,2,1,0] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2796,12 +2796,12 @@ ; AVX1-LABEL: shuffle_v8i32_76547654: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_76547654: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-SLOW-NEXT: retq ; @@ -2814,13 +2814,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_76547654: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_76547654: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-SLOW-NEXT: retq ; @@ -2833,7 +2833,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_76547654: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2844,12 +2844,12 @@ ; AVX1-LABEL: shuffle_v8i32_76543210: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_76543210: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: retq ; @@ -2861,13 +2861,13 @@ ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8i32_76543210: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_76543210: ; AVX512VL-SLOW: # %bb.0: -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} 
ymm0 = ymm0[2,3,0,1] ; AVX512VL-SLOW-NEXT: retq ; @@ -2879,7 +2879,7 @@ ; ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_76543210: ; AVX512VL-FAST-PERLANE: # %bb.0: -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2890,13 +2890,13 @@ ; AVX1OR2-LABEL: shuffle_v8i32_3210ba98: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210ba98: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_3210ba98: @@ -2908,7 +2908,7 @@ ; AVX512VL-FAST-PERLANE-LABEL: shuffle_v8i32_3210ba98: ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -2918,13 +2918,13 @@ ; AVX1OR2-LABEL: shuffle_v8i32_3210fedc: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_3210fedc: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_3210fedc: @@ -2940,7 +2940,7 @@ ; AVX1OR2-LABEL: shuffle_v8i32_7654fedc: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_7654fedc: @@ -2968,7 +2968,7 @@ ; AVX1OR2-LABEL: shuffle_v8i32_fedc7654: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[2,3] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_fedc7654: @@ -2997,13 +2997,13 @@ ; AVX1OR2-LABEL: shuffle_v8i32_ba987654: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba987654: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq 
; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba987654: @@ -3020,13 +3020,13 @@ ; AVX1OR2-LABEL: shuffle_v8i32_ba983210: ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_ba983210: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v8i32_ba983210: @@ -3193,7 +3193,7 @@ define <8 x i32> @shuffle_v8i32_uuuu1111(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_uuuu1111: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -3203,7 +3203,7 @@ define <8 x i32> @shuffle_v8i32_2222uuuu(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_2222uuuu: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,2,2] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,2,2] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3222,7 +3222,7 @@ ; AVX1-LABEL: shuffle_v8i32_44444444: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_44444444: @@ -3268,7 +3268,7 @@ ; AVX1-LABEL: shuffle_v8i32_44444444_bc: ; AVX1: # %bb.0: ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v8i32_44444444_bc: @@ -3316,7 +3316,7 @@ ; ALL-LABEL: shuffle_v8i32_5555uuuu: ; ALL: # %bb.0: ; ALL-NEXT: vextractf128 $1, %ymm0, %xmm0 -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; ALL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3327,7 +3327,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm0[1,2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[3,1,1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,1,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,0,3,2] @@ -3361,7 +3361,7 @@ ; ; AVX2OR512VL-LABEL: shuffle_v8i32_uuuuuu7u: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] +; AVX2OR512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,3,3,3,7,7,7,7] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3372,14 +3372,14 @@ ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_32107654_v4i32: ; AVX512VL-SLOW: # %bb.0: ; 
AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_32107654_v4i32: @@ -3394,7 +3394,7 @@ ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> %2 = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> @@ -3407,14 +3407,14 @@ ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v8i32_00004444_v4f32: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-ALL-LABEL: shuffle_v8i32_00004444_v4f32: @@ -3429,7 +3429,7 @@ ; AVX512VL-FAST-PERLANE: # %bb.0: ; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-FAST-PERLANE-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> ret <8 x i32> %1 @@ -3449,7 +3449,7 @@ define <8 x float> @splat_v8f32(<4 x float> %r) { ; AVX1-LABEL: splat_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -3470,7 +3470,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v8i32_z0U2zUz6: @@ -3486,7 +3486,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v8i32_1U3z5zUU: @@ -3560,7 +3560,7 @@ define <8 x i32> @shuffle_v8i32_30127456(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_30127456: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,0,1,2,7,4,5,6] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3569,7 +3569,7 @@ define <8 x i32> @shuffle_v8i32_12305674(<8 x i32> %a, <8 x i32> %b) { ; ALL-LABEL: shuffle_v8i32_12305674: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps 
{{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] +; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2,3,0,5,6,7,4] ; ALL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle @@ -3822,14 +3822,14 @@ define <8 x float> @broadcast_concat_crash(<4 x float> %x, <4 x float> %y, float %z) { ; AVX1OR2-LABEL: broadcast_concat_crash: ; AVX1OR2: # %bb.0: # %entry -; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX1OR2-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1OR2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: broadcast_concat_crash: ; AVX512VL-SLOW: # %bb.0: # %entry -; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[3,3,3,3] +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm1[3,3,3,3] ; AVX512VL-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX512VL-SLOW-NEXT: retq @@ -3931,7 +3931,7 @@ ; AVX1-LABEL: lowhalf_v8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,2,2] ; AVX1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: retq ; @@ -3958,7 +3958,7 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[2,2] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: lowhalf_v8f32: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -411,7 +411,7 @@ define <16 x i32> @shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i16_1_0_0_0_5_4_4_4_9_8_8_8_13_12_12_12: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[1,0,0,0,5,4,4,4,9,8,8,8,13,12,12,12] ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %c @@ -420,7 +420,7 @@ define <16 x i32> @shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12(<16 x i32> %a, <16 x i32> %b) { ; ALL-LABEL: shuffle_v16i16_3_3_0_0_7_7_4_4_11_11_8_8_15_15_12_12: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> ret <16 x i32> %c diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -1300,7 +1300,7 @@ define <8 x i64> @shuffle_v8i64_00224466(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_00224466: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1309,7 +1309,7 @@ define <8 x i64> @shuffle_v8i64_10325476(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_10325476: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps 
{{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle @@ -1318,7 +1318,7 @@ define <8 x i64> @shuffle_v8i64_11335577(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_11335577: ; ALL: # %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] +; ALL-NEXT: vshufps {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15] ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> ret <8 x i64> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -16,7 +16,7 @@ ; ; KNL-LABEL: expand: ; KNL: # %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; KNL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3,4,5,6,7] ; KNL-NEXT: ret{{[l|q]}} @@ -291,7 +291,7 @@ ; ; KNL-LABEL: expand15: ; KNL: # %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; KNL-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] ; KNL-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6,7] ; KNL-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -76,7 +76,7 @@ define <4 x float> @combine_vpermilvar_4f32_unpckh(<4 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_4f32_unpckh: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 @@ -85,7 +85,7 @@ define <4 x float> @combine_vpermilvar_4f32_unpckl(<4 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_4f32_unpckl: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,1,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) ret <4 x float> %1 @@ -143,7 +143,7 @@ ; AVX512-NEXT: vmovaps {{.*#+}} ymm1 = [16,17,18,19,3,2,1,0] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] ; AVX512-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = shufflevector <8 x float> %1, <8 x float> zeroinitializer, <8 x i32> @@ -274,7 +274,7 @@ define <4 x float> @combine_vpermilvar_4f32_4stage(<4 x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_4f32_4stage: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,1] +; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0,3,1] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> ) %2 = tail call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %1, <4 x i32> ) @@ -286,7 +286,7 @@ define <8 x float> @combine_vpermilvar_8f32_4stage(<8 
x float> %a0) { ; CHECK-LABEL: combine_vpermilvar_8f32_4stage: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> ) %2 = tail call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %1, <8 x i32> ) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -755,7 +755,7 @@ define <32 x i8> @combine_pshufb_pshufb_or_pshufb(<32 x i8> %a0) { ; CHECK-LABEL: combine_pshufb_pshufb_or_pshufb: ; CHECK: # %bb.0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] ; CHECK-NEXT: ret{{[l|q]}} %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> ) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -204,7 +204,7 @@ ; ; AVX-LABEL: combine_pshufb_palignr: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> ) @@ -723,7 +723,7 @@ ; ; AVX1-LABEL: combine_pshufb_pshufb_or_pshufb: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -100,7 +100,7 @@ ; ; AVX1-LABEL: combine_pshufd6: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_pshufd6: @@ -178,7 +178,7 @@ ; AVX-LABEL: combine_bitwise_ops_test1: ; AVX: # %bb.0: ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> @@ -196,7 +196,7 @@ ; AVX-LABEL: combine_bitwise_ops_test2: ; AVX: # %bb.0: ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> @@ -214,7 +214,7 @@ ; AVX-LABEL: combine_bitwise_ops_test3: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32> %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32> @@ -232,7 +232,7 @@ ; AVX-LABEL: combine_bitwise_ops_test4: ; AVX: # %bb.0: 
; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> @@ -250,7 +250,7 @@ ; AVX-LABEL: combine_bitwise_ops_test5: ; AVX: # %bb.0: ; AVX-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> @@ -268,7 +268,7 @@ ; AVX-LABEL: combine_bitwise_ops_test6: ; AVX: # %bb.0: ; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX-NEXT: retq %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32> %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32> @@ -628,7 +628,7 @@ ; ; AVX-LABEL: combine_nested_undef_test1: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -643,7 +643,7 @@ ; ; AVX-LABEL: combine_nested_undef_test2: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -658,7 +658,7 @@ ; ; AVX-LABEL: combine_nested_undef_test3: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,0,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -673,7 +673,7 @@ ; ; AVX1-LABEL: combine_nested_undef_test4: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test4: @@ -693,7 +693,7 @@ ; ; AVX-LABEL: combine_nested_undef_test5: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -708,7 +708,7 @@ ; ; AVX-LABEL: combine_nested_undef_test6: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -723,7 +723,7 @@ ; ; AVX-LABEL: combine_nested_undef_test7: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,0,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -738,7 +738,7 @@ ; ; AVX-LABEL: combine_nested_undef_test8: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,3,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -753,7 +753,7 @@ ; ; AVX-LABEL: combine_nested_undef_test9: ; AVX: # %bb.0: -; 
AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,2] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3,2,2] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -768,7 +768,7 @@ ; ; AVX-LABEL: combine_nested_undef_test10: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,1,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,1,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -783,7 +783,7 @@ ; ; AVX-LABEL: combine_nested_undef_test11: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,2,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1,2,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -798,7 +798,7 @@ ; ; AVX1-LABEL: combine_nested_undef_test12: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test12: @@ -867,15 +867,15 @@ ; ; AVX1-LABEL: combine_nested_undef_test15: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test15: ; AVX2: # %bb.0: ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1 -; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] ; AVX2-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> @@ -906,7 +906,7 @@ ; ; AVX-LABEL: combine_nested_undef_test16: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> @@ -936,7 +936,7 @@ ; AVX-LABEL: combine_nested_undef_test17: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,1,0,1] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -951,7 +951,7 @@ ; ; AVX-LABEL: combine_nested_undef_test18: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,0,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -980,7 +980,7 @@ ; AVX-LABEL: combine_nested_undef_test19: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,0,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1011,7 +1011,7 @@ ; AVX-LABEL: combine_nested_undef_test20: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,3,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x 
i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1040,7 +1040,7 @@ ; AVX1-LABEL: combine_nested_undef_test21: ; AVX1: # %bb.0: ; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test21: @@ -1065,7 +1065,7 @@ ; ; AVX-LABEL: combine_nested_undef_test22: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,1,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,1,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1080,7 +1080,7 @@ ; ; AVX-LABEL: combine_nested_undef_test23: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,0,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,0,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1095,7 +1095,7 @@ ; ; AVX-LABEL: combine_nested_undef_test24: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1110,7 +1110,7 @@ ; ; AVX1-LABEL: combine_nested_undef_test25: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test25: @@ -1130,7 +1130,7 @@ ; ; AVX-LABEL: combine_nested_undef_test26: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1145,7 +1145,7 @@ ; ; AVX1-LABEL: combine_nested_undef_test27: ; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_nested_undef_test27: @@ -1165,7 +1165,7 @@ ; ; AVX-LABEL: combine_nested_undef_test28: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,0] ; AVX-NEXT: retq %1 = shufflevector <4 x i32> %B, <4 x i32> %A, <4 x i32> %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> @@ -1640,7 +1640,7 @@ ; ; AVX-LABEL: combine_test1b: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[0,1,2,0] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1,2,0] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1695,7 +1695,7 @@ ; AVX-LABEL: combine_test3b: ; AVX: # %bb.0: ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,3,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -1711,7 +1711,7 @@ ; ; AVX-LABEL: combine_test4b: ; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[1,1,2,3] +; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[1,1,2,3] ; AVX-NEXT: retq %1 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> @@ -2457,7 +2457,7 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: 
vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: combine_unneeded_subvector1:
@@ -2499,7 +2499,7 @@
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[2,3],ymm0[0,1]
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: combine_unneeded_subvector2:
@@ -2741,7 +2741,7 @@
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm1
 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX-NEXT: retq
 entry:
 %s1 = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32>
@@ -2777,7 +2777,7 @@
 ;
 ; AVX-LABEL: PR22390:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,0,1,2]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,0,1,2]
 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3]
 ; AVX-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-concatenation.ll
@@ -22,7 +22,7 @@
 ; AVX-LABEL: concat_a_to_shuf_of_a:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX-NEXT: vmovaps %ymm0, (%rsi)
 ; AVX-NEXT: vzeroupper
@@ -69,7 +69,7 @@
 ; AVX-LABEL: concat_shuf_of_a_to_a:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX-NEXT: vmovaps %ymm0, (%rdx)
 ; AVX-NEXT: vzeroupper
@@ -119,7 +119,7 @@
 ; AVX-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vmovaps %xmm1, (%rdx)
 ; AVX-NEXT: vmovaps %xmm0, 16(%rsi)
 ; AVX-NEXT: vmovaps %xmm1, (%rsi)
@@ -128,7 +128,7 @@
 ; AVX2-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX2-NEXT: vmovaps %xmm1, (%rdx)
 ; AVX2-NEXT: vmovaps %xmm0, 16(%rsi)
 ; AVX2-NEXT: vmovaps %xmm1, (%rsi)
@@ -137,7 +137,7 @@
 ; AVX512F-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vmovaps (%rdi), %xmm0
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512F-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT: vmovaps %xmm1, (%rdx)
 ; AVX512F-NEXT: vmovaps %xmm0, 16(%rsi)
 ; AVX512F-NEXT: vmovaps %xmm1, (%rsi)
@@ -146,7 +146,7 @@
 ; AVX512BW-LABEL: concat_a_to_shuf_of_a_extrause_of_shuf:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512BW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT: vmovaps %xmm1, (%rdx)
 ; AVX512BW-NEXT: vmovaps %xmm0, 16(%rsi)
 ; AVX512BW-NEXT: vmovaps %xmm1, (%rsi)
@@ -618,7 +618,7 @@
 ; AVX-LABEL: concat_aaa_to_shuf_of_a:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX-NEXT: vmovaps %ymm0, 32(%rsi)
 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
@@ -675,7 +675,7 @@
 ; AVX-LABEL: concat_shuf_of_a_to_aaa:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX-NEXT: vmovaps %ymm0, (%rsi)
 ; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -119,7 +119,7 @@
 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
 ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v2i64_v2i32:
@@ -263,7 +263,7 @@
 ; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
 ; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX-NEXT: vmovlpd %xmm0, (%rdi)
 ; AVX-NEXT: retq
 ;
@@ -1119,7 +1119,7 @@
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: retq
 ;
@@ -1131,7 +1131,7 @@
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
 ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-SLOW-NEXT: retq
 ;
@@ -1288,7 +1288,7 @@
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: vmovd %xmm0, (%rdi)
 ; AVX1-NEXT: retq
@@ -1301,7 +1301,7 @@
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
 ; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
 ; AVX2-SLOW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -80,7 +80,7 @@
 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
 ; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_usat_v2i64_v2i32:
@@ -183,7 +183,7 @@
 ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
 ; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX-NEXT: vmovlpd %xmm0, (%rdi)
 ; AVX-NEXT: retq
 ;
@@ -781,7 +781,7 @@
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: retq
 ;
@@ -792,7 +792,7 @@
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
 ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-SLOW-NEXT: retq
 ;
@@ -907,7 +907,7 @@
 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT: vmovd %xmm0, (%rdi)
 ; AVX1-NEXT: retq
@@ -919,7 +919,7 @@
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854841343,9223372036854841343]
 ; AVX2-SLOW-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
 ; AVX2-SLOW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vselect-avx.ll b/llvm/test/CodeGen/X86/vselect-avx.ll
--- a/llvm/test/CodeGen/X86/vselect-avx.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx.ll
@@ -288,16 +288,16 @@
 ; AVX1-LABEL: vselect_concat_splat:
 ; AVX1: ## %bb.0: ## %entry
 ; AVX1-NEXT: vmovups (%rax), %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,3,2,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,3,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[0,3,2,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,3,2]
 ; AVX1-NEXT: vmovups 16, %xmm2
 ; AVX1-NEXT: vmovups 32, %xmm3
 ; AVX1-NEXT: vblendps {{.*#+}} xmm4 = mem[0],xmm3[1],mem[2,3]
 ; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm4 = xmm4[0,3,2,1]
+; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,3,2,1]
 ; AVX1-NEXT: vblendps {{.*#+}} xmm3 = mem[0,1],xmm3[2,3]
 ; AVX1-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm2[1,0,3,2]
+; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0,3,2]
 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT: vcmpneqps %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT: vblendvps %xmm3, %xmm4, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1720,13 +1720,13 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovups (%rdi), %ymm0
 ; AVX2-NEXT: vmovups (%rdi), %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
 ; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
 ; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
@@ -1780,13 +1780,13 @@
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovups (%rdi), %ymm0
 ; AVX2-NEXT: vmovups (%rdi), %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,1]
+; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,0,1,1]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,0,1,1]
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
+; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,0,1,1,4,4,5,5]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,2,3,3]
-; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
 ; AVX2-NEXT: vmovups %ymm0, 96(%rsi)
 ; AVX2-NEXT: vmovups %ymm3, 64(%rsi)
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll
@@ -1942,7 +1942,7 @@
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpaddb 16(%rdx), %xmm1, %xmm1
 ; AVX-NEXT: vpaddb (%rdx), %xmm0, %xmm0
@@ -4649,7 +4649,7 @@
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2
 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3]
 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
@@ -4849,14 +4849,14 @@
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1
 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7]
 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2
@@ -6613,7 +6613,7 @@
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm2
 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3
diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
--- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
+++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll
@@ -1562,7 +1562,7 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],mem[1,3],ymm0[4,4],mem[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1
 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0
@@ -3743,7 +3743,7 @@
 ; AVX-NEXT: vmovaps 48(%rdi), %xmm1
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm2 = mem[0,1,0,1]
 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[1,3],ymm2[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,2,1,3]
+; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm1[0,2,1,3]
 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
@@ -3896,13 +3896,13 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovaps (%rdi), %xmm0
 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; AVX-NEXT: vbroadcastss (%rdi), %ymm2
 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5],ymm2[6],ymm3[7]
 ; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3
 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3]
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,1,1]
 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0
 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2
@@ -5369,7 +5369,7 @@
 ; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[1,3],ymm0[4,4],ymm1[5,7]
-; AVX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm2
 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm3
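
Illustrative note, not part of the patch: most of the test churn above is the same one-for-one substitution. A single-source float permute that previously printed as "vpermilps $imm, %xmmA, %xmmB" now prints as "vshufps $imm, %xmmA, %xmmA, %xmmB", since shufps with its source repeated in both inputs computes the identical permute (the remaining changes swap vpshufd for vshufps the same way). A hypothetical reduced test in the same .ll style, with a made-up function name and mask not taken from this patch, could look like:

; Sketch only (not in this patch). With the new lowering the expected AVX
; output would be  vshufps $27, %xmm0, %xmm0, %xmm0  # xmm0 = xmm0[3,2,1,0]
; instead of       vpermilps $27, %xmm0, %xmm0       # xmm0 = xmm0[3,2,1,0]
define <4 x float> @reverse_v4f32(<4 x float> %a) {
  %r = shufflevector <4 x float> %a, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %r
}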