Index: test/CodeGen/X86/avx2-conversions.ll =================================================================== --- test/CodeGen/X86/avx2-conversions.ll +++ test/CodeGen/X86/avx2-conversions.ll @@ -1,15 +1,25 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64-FAST define <4 x i32> @trunc4(<4 x i64> %A) nounwind { -; X32-LABEL: trunc4: -; X32: # %bb.0: -; X32-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; X32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; X32-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl +; X32-SLOW-LABEL: trunc4: +; X32-SLOW: # %bb.0: +; X32-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; X32-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X32-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; X32-SLOW-NEXT: vzeroupper +; X32-SLOW-NEXT: retl +; +; X32-FAST-LABEL: trunc4: +; X32-FAST: # %bb.0: +; X32-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; X32-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X32-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; X32-FAST-NEXT: vzeroupper +; X32-FAST-NEXT: retl ; ; X64-LABEL: trunc4: ; X64: # %bb.0: @@ -18,6 +28,14 @@ ; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq +; +; X64-FAST-LABEL: trunc4: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; X64-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; X64-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; X64-FAST-NEXT: vzeroupper +; X64-FAST-NEXT: retq %B = trunc <4 x i64> %A to <4 x i32> ret <4 x i32>%B } @@ -38,6 +56,14 @@ ; X64-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq +; +; X64-FAST-LABEL: trunc8: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; X64-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; X64-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; X64-FAST-NEXT: vzeroupper +; X64-FAST-NEXT: retq %B = trunc <8 x i32> %A to <8 x i16> ret <8 x i16>%B } @@ -52,6 +78,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovsxdq %xmm0, %ymm0 ; X64-NEXT: retq +; +; X64-FAST-LABEL: sext4: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovsxdq %xmm0, %ymm0 +; X64-FAST-NEXT: retq %B = sext <4 x i32> %A to <4 x i64> ret <4 x i64>%B } @@ -66,6 +97,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovsxwd %xmm0, %ymm0 ; X64-NEXT: retq +; +; X64-FAST-LABEL: sext8: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovsxwd %xmm0, %ymm0 +; X64-FAST-NEXT: retq %B = sext <8 x i16> %A to <8 x i32> ret <8 x i32>%B } @@ -80,6 +116,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: retq +; +; X64-FAST-LABEL: zext4: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-FAST-NEXT: retq %B = zext <4 x i32> %A to <4 x i64> ret <4 x i64>%B } @@ -94,6 +135,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: retq +; +; X64-FAST-LABEL: zext8: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-FAST-NEXT: retq %B = zext <8 x i16> %A to <8 x i32> ret <8 x i32>%B } @@ -110,6 +156,12 @@ ; X64-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: retq +; +; X64-FAST-LABEL: zext_8i8_8i32: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; X64-FAST-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-FAST-NEXT: retq %B = zext <8 x i8> %A to <8 x i32> ret <8 x i32>%B } @@ -124,6 +176,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-NEXT: retq +; +; X64-FAST-LABEL: zext_16i8_16i16: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X64-FAST-NEXT: retq %t = zext <16 x i8> %z to <16 x i16> ret <16 x i16> %t } @@ -138,6 +195,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovsxbw %xmm0, %ymm0 ; X64-NEXT: retq +; +; X64-FAST-LABEL: sext_16i8_16i16: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovsxbw %xmm0, %ymm0 +; X64-FAST-NEXT: retq %t = sext <16 x i8> %z to <16 x i16> ret <16 x i16> %t } @@ -162,6 +224,16 @@ ; X64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: vzeroupper ; X64-NEXT: retq +; +; X64-FAST-LABEL: trunc_16i16_16i8: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 +; X64-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; X64-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X64-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; X64-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X64-FAST-NEXT: vzeroupper +; X64-FAST-NEXT: retq %t = trunc <16 x i16> %z to <16 x i8> ret <16 x i8> %t } @@ -177,6 +249,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovsxdq (%rdi), %ymm0 ; X64-NEXT: retq +; +; X64-FAST-LABEL: load_sext_test1: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovsxdq (%rdi), %ymm0 +; X64-FAST-NEXT: retq %X = load <4 x i32>, <4 x i32>* %ptr %Y = sext <4 x i32> %X to <4 x i64> ret <4 x i64>%Y @@ -193,6 +270,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovsxbq (%rdi), %ymm0 ; X64-NEXT: retq +; +; X64-FAST-LABEL: load_sext_test2: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovsxbq (%rdi), %ymm0 +; X64-FAST-NEXT: retq %X = load <4 x i8>, <4 x i8>* %ptr %Y = sext <4 x i8> %X to <4 x i64> ret <4 x i64>%Y @@ -209,6 +291,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovsxwq (%rdi), %ymm0 ; X64-NEXT: retq +; +; X64-FAST-LABEL: load_sext_test3: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovsxwq (%rdi), %ymm0 +; X64-FAST-NEXT: retq %X = load <4 x i16>, <4 x i16>* %ptr %Y = sext <4 x i16> %X to <4 x i64> ret <4 x i64>%Y @@ -225,6 +312,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovsxwd (%rdi), %ymm0 ; X64-NEXT: retq +; +; X64-FAST-LABEL: load_sext_test4: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovsxwd (%rdi), %ymm0 +; X64-FAST-NEXT: retq %X = load <8 x i16>, <8 x i16>* %ptr %Y = sext <8 x i16> %X to <8 x i32> ret <8 x i32>%Y @@ -241,6 +333,11 @@ ; X64: # %bb.0: ; X64-NEXT: vpmovsxbd (%rdi), %ymm0 ; X64-NEXT: retq +; +; X64-FAST-LABEL: load_sext_test5: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vpmovsxbd (%rdi), %ymm0 +; X64-FAST-NEXT: retq %X = load <8 x i8>, <8 x i8>* %ptr %Y = sext <8 x i8> %X to <8 x i32> ret <8 x i32>%Y Index: test/CodeGen/X86/avx2-vector-shifts.ll =================================================================== --- test/CodeGen/X86/avx2-vector-shifts.ll +++ test/CodeGen/X86/avx2-vector-shifts.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X32,X32-SLOW +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X32,X32-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=X64,X64-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=X64,X64-FAST ; AVX2 Logical Shift Left @@ -372,25 +374,45 @@ } define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind { -; X32-LABEL: srl_trunc_and_v4i64: -; X32: # %bb.0: -; X32-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; X32-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X32-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X32-NEXT: vpand %xmm2, %xmm1, %xmm1 -; X32-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; X32-NEXT: vzeroupper -; X32-NEXT: retl -; -; X64-LABEL: srl_trunc_and_v4i64: -; X64: # %bb.0: -; X64-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; X64-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; X64-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] -; X64-NEXT: vpand %xmm2, %xmm1, %xmm1 -; X64-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; X64-NEXT: vzeroupper -; X64-NEXT: retq +; X32-SLOW-LABEL: srl_trunc_and_v4i64: +; X32-SLOW: # %bb.0: +; X32-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; X32-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; X32-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] +; X32-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X32-SLOW-NEXT: vzeroupper +; X32-SLOW-NEXT: retl +; +; X32-FAST-LABEL: srl_trunc_and_v4i64: +; X32-FAST: # %bb.0: +; X32-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; X32-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; X32-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] +; X32-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X32-FAST-NEXT: vzeroupper +; X32-FAST-NEXT: retl +; +; X64-SLOW-LABEL: srl_trunc_and_v4i64: +; X64-SLOW: # %bb.0: +; X64-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; X64-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; X64-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] +; X64-SLOW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X64-SLOW-NEXT: vzeroupper +; X64-SLOW-NEXT: retq +; +; X64-FAST-LABEL: srl_trunc_and_v4i64: +; X64-FAST: # %bb.0: +; X64-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; X64-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; X64-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,8,8,8] +; X64-FAST-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X64-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; X64-FAST-NEXT: vzeroupper +; X64-FAST-NEXT: retq %and = and <4 x i64> %y, %trunc = trunc <4 x i64> %and to <4 x i32> %sra = lshr <4 x i32> %x, %trunc Index: test/CodeGen/X86/avx512-extract-subvector-load-store.ll =================================================================== --- test/CodeGen/X86/avx512-extract-subvector-load-store.ll +++ test/CodeGen/X86/avx512-extract-subvector-load-store.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq -O2 | FileCheck %s --check-prefix=AVX512 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl -O2 | FileCheck %s --check-prefix=AVX512NOTDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+fast-variable-shuffle -O2 | FileCheck %s --check-prefix=AVX512NOTDQ define void @load_v8i1_broadcast_4_v2i1(<8 x i1>* %a0,<2 x double> %a1,<2 x double> %a2,<2 x double>* %a3) { ; AVX512-LABEL: load_v8i1_broadcast_4_v2i1: @@ -331,8 +331,8 @@ ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $24, %k0, %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512-NEXT: vmovaps %ymm1, (%rsi) @@ -345,8 +345,8 @@ ; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} -; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] -; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) @@ -541,8 +541,8 @@ ; AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $56, %k0, %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm2 -; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512-NEXT: vpmovd2m %ymm2, %k1 ; AVX512-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512-NEXT: vmovaps %ymm1, (%rsi) @@ -555,8 +555,8 @@ ; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm2, %ymm2 {%k1} {z} -; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[3,3,2,3,7,7,6,7] -; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,2] +; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm3 = [7,7,7,7,7,7,7,7] +; AVX512NOTDQ-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX512NOTDQ-NEXT: vptestmd %ymm2, %ymm2, %k1 ; AVX512NOTDQ-NEXT: vmovaps %ymm0, %ymm1 {%k1} ; AVX512NOTDQ-NEXT: vmovaps %ymm1, (%rsi) @@ -1134,8 +1134,8 @@ ; AVX512-NEXT: kmovd (%rdi), %k0 ; AVX512-NEXT: kshiftrd $24, %k0, %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] +; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: vzeroupper @@ -1147,8 +1147,8 @@ ; AVX512NOTDQ-NEXT: kshiftrd $24, %k0, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] -; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] +; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) @@ -1369,8 +1369,8 @@ ; AVX512-NEXT: kmovq (%rdi), %k0 ; AVX512-NEXT: kshiftrq $56, %k0, %k0 ; AVX512-NEXT: vpmovm2d %k0, %ymm0 -; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] -; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] +; AVX512-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512-NEXT: vpmovd2m %ymm0, %k0 ; AVX512-NEXT: kmovb %k0, (%rsi) ; AVX512-NEXT: vzeroupper @@ -1382,8 +1382,8 @@ ; AVX512NOTDQ-NEXT: kshiftrq $56, %k0, %k1 ; AVX512NOTDQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512NOTDQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512NOTDQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,3,2,3,7,7,6,7] -; AVX512NOTDQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX512NOTDQ-NEXT: vpbroadcastd {{.*#+}} ymm1 = [7,7,7,7,7,7,7,7] +; AVX512NOTDQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512NOTDQ-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512NOTDQ-NEXT: kmovd %k0, %eax ; AVX512NOTDQ-NEXT: movb %al, (%rsi) Index: test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll =================================================================== --- test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll +++ test/CodeGen/X86/avx512-shuffles/broadcast-vector-int.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq %s -o - | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512dq,+fast-variable-shuffle %s -o - | FileCheck %s ; FIXME: fixing PR34394 should fix the i32x2 memory cases resulting in a simple vbroadcasti32x2 instruction. @@ -459,8 +459,8 @@ define <8 x i32> @test_2xi32_to_8xi32_mem(<2 x i32>* %vp) { ; CHECK-LABEL: test_2xi32_to_8xi32_mem: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 ; CHECK-NEXT: retq %vec = load <2 x i32>, <2 x i32>* %vp @@ -470,8 +470,8 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1] @@ -486,8 +486,8 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask0(<2 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1] @@ -501,8 +501,8 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1] @@ -517,8 +517,8 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask1(<2 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1] @@ -532,8 +532,8 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1] @@ -548,8 +548,8 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask2(<2 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1] @@ -563,8 +563,8 @@ define <8 x i32> @test_masked_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %default, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_2xi32_to_8xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm2 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} = xmm2[0,1,0,1,0,1,0,1] @@ -579,8 +579,8 @@ define <8 x i32> @test_masked_z_2xi32_to_8xi32_mem_mask3(<2 x i32>* %vp, <8 x i32> %mask) { ; CHECK-LABEL: test_masked_z_2xi32_to_8xi32_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; CHECK-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,4,5,6,7],zero,zero,zero,zero ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} ymm0 {%k1} {z} = xmm1[0,1,0,1,0,1,0,1] Index: test/CodeGen/X86/avx512-shuffles/partial_permute.ll =================================================================== --- test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1,16 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw %s -o - | FileCheck %s +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle %s -o - | FileCheck %s ; FIXME: All cases here should be fixed by PR34380 define <8 x i16> @test_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec) { ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,6,6,4] -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9] ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4],xmm0[5,6,7] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -20,11 +18,9 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,6,4] -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] +; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9] ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3,4],xmm0[5,6,7] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 @@ -40,11 +36,9 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask0(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,6,6,4] -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[14,15,12,13,12,13,8,9,14,15,12,13,12,13,8,9] ; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4] +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,8,9,10,11,0,1,2,3,12,13,0,1] ; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3,4],xmm0[5,6,7] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 @@ -59,11 +53,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2],xmm0[3],xmm3[4,5,6,7] +; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2],xmm3[3],xmm0[4,5,6,7] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -78,11 +71,10 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask1(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2],xmm0[3],xmm2[4,5,6,7] +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,12,13,14,15] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,2,3,10,11,12,13,14,15,8,9,12,13] +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2],xmm2[3],xmm0[4,5,6,7] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} @@ -96,11 +88,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4,5,6],xmm3[7] +; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3] +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3],xmm3[4,5,6],xmm0[7] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 ; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm0 {%k1} @@ -115,11 +106,10 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mask2(<16 x i16> %vec, <8 x i16> %mask) { ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3],xmm0[4,5,6],xmm2[7] +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[8,9,10,11,4,5,6,7,14,15,2,3,12,13,14,15] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,12,13,4,5,0,1,2,3,12,13,2,3] +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3],xmm2[4,5,6],xmm0[7] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm0, %xmm0 {%k1} {z} @@ -181,11 +171,10 @@ ; CHECK-LABEL: test_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,2,0] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6],xmm0[7] +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm1 +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3] +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] +; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6],xmm1[7] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <16 x i16>, <16 x i16>* %vp @@ -196,11 +185,10 @@ ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm2 -; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] -; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,0] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6],xmm2[7] +; CHECK-NEXT: vextracti128 $1, %ymm2, %xmm3 +; CHECK-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3] +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] +; CHECK-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6],xmm3[7] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1 ; CHECK-NEXT: vmovdqu16 %xmm2, %xmm0 {%k1} @@ -217,11 +205,10 @@ ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask0: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm1 -; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] -; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,0] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] -; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4],xmm1[5],xmm2[6],xmm1[7] +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm2 +; CHECK-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,10,11,6,7,8,9,10,11,0,1,2,3] +; CHECK-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,14,15,12,13,6,7,10,11,10,11,6,7,6,7] +; CHECK-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6],xmm2[7] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpcmpeqw %xmm2, %xmm0, %k1 ; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} {z} @@ -2192,11 +2179,11 @@ ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask1: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5,6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,2,1] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,0,2,5] +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2207,11 +2194,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask1(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,2,1] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,2,5] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2222,11 +2210,11 @@ ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask2: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3],ymm3[4,5],ymm0[6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,2,7] +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2237,11 +2225,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask2(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3],ymm2[4,5],ymm0[6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,7,2,7] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2251,9 +2240,10 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,0,3] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [2,4,4,3] +; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> ret <4 x i64> %res @@ -2262,11 +2252,11 @@ ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask3: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,0,3] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [2,4,4,3] +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2277,11 +2267,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask3(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,0,3] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [2,4,4,3] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2292,11 +2283,11 @@ ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask4: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,3,1] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,3,3,1] +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2307,11 +2298,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask4(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask4: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,3,1] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [7,3,3,1] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2352,9 +2344,10 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,2,1,3] -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,2,1,7] +; CHECK-NEXT: vpermi2q %ymm0, %ymm2, %ymm1 +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> ret <4 x i64> %res @@ -2363,11 +2356,11 @@ ; CHECK-LABEL: test_masked_8xi64_to_4xi64_perm_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,2,1,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,2,1,7] +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpcmpeqq %ymm0, %ymm2, %k1 +; CHECK-NEXT: vpblendmq %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2378,12 +2371,12 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mask6(<8 x i64> %vec, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,2,1,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,2,1,7] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2q %ymm0, %ymm3, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2537,10 +2530,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3,4,5,6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,3,2,0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [4,3,2,4] +; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2552,12 +2546,13 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask1(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3,4,5,6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,3,2,0] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,3,2,4] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2571,10 +2566,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,0] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,5,5,1] +; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2586,12 +2582,13 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask2(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,0] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,5,5,1] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2603,10 +2600,10 @@ define <4 x i64> @test_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp) { ; CHECK-LABEL: test_8xi64_to_4xi64_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,1,3] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 +; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [7,0,0,2] +; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2617,10 +2614,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [7,0,0,2] +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2632,12 +2630,13 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask3(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,3] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [7,0,0,2] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2687,10 +2686,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[0,2,3,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,7,1] +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2702,12 +2702,13 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask5(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[0,2,3,1] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,7,1] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2q %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2769,10 +2770,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 ; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = ymm2[2,2,0,1] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,1,5] +; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovdqa64 %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -2784,12 +2786,13 @@ define <4 x i64> @test_masked_z_8xi64_to_4xi64_perm_mem_mask7(<8 x i64>* %vp, <4 x i64> %mask) { ; CHECK-LABEL: test_masked_z_8xi64_to_4xi64_perm_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm1 = ymm2[1],ymm1[1],ymm2[3],ymm1[3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm0, %k1 -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm1[2,2,0,1] +; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,3,1,5] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpcmpeqq %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2q %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> @@ -3465,11 +3468,11 @@ define <4 x float> @test_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec) { ; CHECK-LABEL: test_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,1,3,3] -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm1 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,1,3,3] +; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <16 x float> %vec, <16 x float> undef, <4 x i32> @@ -3478,11 +3481,11 @@ define <4 x float> @test_masked_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %vec2, <4 x float> %mask) { ; CHECK-LABEL: test_masked_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,1,3,3] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm0[1],xmm3[2],xmm0[3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,4,6,6,7] +; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm3 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vcmpeqps %xmm3, %xmm2, %k1 ; CHECK-NEXT: vblendmps %xmm0, %xmm1, %xmm0 {%k1} @@ -3497,11 +3500,11 @@ define <4 x float> @test_masked_z_16xfloat_to_4xfloat_perm_mask3(<16 x float> %vec, <4 x float> %mask) { ; CHECK-LABEL: test_masked_z_16xfloat_to_4xfloat_perm_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3] +; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm2 +; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm0 +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,1,3,3] +; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vcmpeqps %xmm2, %xmm1, %k1 ; CHECK-NEXT: vmovaps %xmm0, %xmm0 {%k1} {z} @@ -4022,9 +4025,10 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,7,3,7] +; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1 +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> ret <4 x double> %res @@ -4032,12 +4036,12 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm3[1],ymm0[1],ymm3[3],ymm0[3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,7,3,7] +; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 +; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4048,11 +4052,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask0(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,7,3,7] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4191,12 +4196,12 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,2] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [2,6,2,2] +; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 +; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4207,11 +4212,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask5(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,2] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,6,2,2] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4221,9 +4227,10 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm0[0],ymm1[3],ymm0[2] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,1] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,4,3,4] +; CHECK-NEXT: vpermi2pd %ymm0, %ymm2, %ymm1 +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> ret <4 x double> %res @@ -4232,11 +4239,11 @@ ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask6: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm3[1],ymm0[0],ymm3[3],ymm0[2] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,1] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,4,3,4] +; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 +; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4247,11 +4254,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask6(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask6: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm2[1],ymm0[0],ymm2[3],ymm0[2] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,1] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [1,4,3,4] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2pd %ymm0, %ymm3, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4262,11 +4270,11 @@ ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mask7: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm3[1,2],ymm0[3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm2, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,1,0,2] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,5,0,6] +; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqpd %ymm0, %ymm2, %k1 +; CHECK-NEXT: vblendmpd %ymm4, %ymm1, %ymm0 {%k1} ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4277,11 +4285,12 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mask7(<8 x double> %vec, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2],ymm0[3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,1,0,2] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [3,5,0,6] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm1, %k1 +; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm2, %ymm0 ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> %cmp = fcmp oeq <4 x double> %mask, zeroinitializer @@ -4291,10 +4300,10 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec) { ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] -; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,6,2,6] +; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 +; CHECK-NEXT: vmovapd %xmm1, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %res = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> @@ -4303,12 +4312,12 @@ define <2 x double> @test_masked_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %vec2, <2 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_2xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %xmm3, %xmm2, %k1 -; CHECK-NEXT: vblendmpd %xmm0, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,6,2,6] +; CHECK-NEXT: vpermi2pd %ymm3, %ymm0, %ymm4 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqpd %xmm0, %xmm2, %k1 +; CHECK-NEXT: vblendmpd %xmm4, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> @@ -4320,12 +4329,12 @@ define <2 x double> @test_masked_z_8xdouble_to_2xdouble_perm_mask0(<8 x double> %vec, <2 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_2xdouble_perm_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 -; CHECK-NEXT: vmovapd %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [0,6,2,6] +; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm3 +; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vcmpeqpd %xmm0, %xmm1, %k1 +; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <2 x i32> @@ -4419,10 +4428,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %zmm2 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[3,0,2,0] +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [3,4,2,4] +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4434,12 +4444,13 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1(<8 x double>* %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask1: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %zmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[3,0,2,0] +; CHECK-NEXT: vmovapd (%rdi), %zmm2 +; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [3,4,2,4] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4453,10 +4464,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %zmm2 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[1,2,3,0] +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,2,3,4] +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4468,12 +4480,13 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2(<8 x double>* %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask2: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %zmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[1,2,3,0] +; CHECK-NEXT: vmovapd (%rdi), %zmm2 +; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [1,2,3,4] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4485,10 +4498,10 @@ define <4 x double> @test_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp) { ; CHECK-LABEL: test_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %zmm0 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,0] -; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] +; CHECK-NEXT: vmovapd (%rdi), %zmm1 +; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [4,2,1,0] +; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4499,11 +4512,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %zmm2 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,1,0] -; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [4,2,1,0] +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4515,13 +4528,13 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3(<8 x double>* %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %zmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,1,0] -; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: vmovapd (%rdi), %zmm2 +; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [4,2,1,0] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4571,10 +4584,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %zmm2 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[2,1,1,1] +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [6,1,1,1] +; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4586,12 +4600,13 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %zmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[2,1,1,1] +; CHECK-NEXT: vmovapd (%rdi), %zmm2 +; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,1,1,1] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4605,9 +4620,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %zmm0 ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,1] -; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3] +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2 +; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [0,2,6,1] +; CHECK-NEXT: vpermi2pd %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4619,11 +4634,11 @@ ; CHECK-NEXT: vmovapd (%rdi), %zmm2 ; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm3 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm2 -; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,2,2,1] -; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm2, %ymm0 {%k1} +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,2,6,1] +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4637,12 +4652,12 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %zmm1 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,1] -; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vmovapd %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,6,1] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2pd %ymm2, %ymm3, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4656,10 +4671,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %zmm2 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = ymm2[0,1,2,1] +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [0,5,2,5] +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 +; CHECK-NEXT: vmovapd %ymm4, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4671,12 +4687,13 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7(<8 x double>* %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask7: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %zmm1 -; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm1[0,1,2,1] +; CHECK-NEXT: vmovapd (%rdi), %zmm2 +; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 +; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [0,5,2,5] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vcmpeqpd %ymm4, %ymm0, %k1 +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm1 {%k1} {z} +; CHECK-NEXT: vmovapd %ymm1, %ymm0 ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4688,10 +4705,10 @@ define <2 x double> @test_8xdouble_to_2xdouble_perm_mem_mask0(<8 x double>* %vp) { ; CHECK-LABEL: test_8xdouble_to_2xdouble_perm_mem_mask0: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd (%rdi), %zmm0 -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3] +; CHECK-NEXT: vmovapd (%rdi), %zmm1 +; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 +; CHECK-NEXT: vmovapd {{.*#+}} ymm0 = [1,6,3,6] +; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -4704,11 +4721,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %zmm2 ; CHECK-NEXT: vextractf64x4 $1, %zmm2, %ymm3 -; CHECK-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[3],ymm3[2] -; CHECK-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,3,2,3] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %xmm3, %xmm1, %k1 -; CHECK-NEXT: vmovapd %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vmovapd {{.*#+}} ymm4 = [1,6,3,6] +; CHECK-NEXT: vpermi2pd %ymm3, %ymm2, %ymm4 +; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %xmm2, %xmm1, %k1 +; CHECK-NEXT: vmovapd %xmm4, %xmm0 {%k1} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp @@ -4723,11 +4740,11 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vmovapd (%rdi), %zmm1 ; CHECK-NEXT: vextractf64x4 $1, %zmm1, %ymm2 -; CHECK-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[1],ymm2[0],ymm1[3],ymm2[2] -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,3,2,3] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %xmm2, %xmm0, %k1 -; CHECK-NEXT: vmovapd %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [1,6,3,6] +; CHECK-NEXT: vpermi2pd %ymm2, %ymm1, %ymm3 +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vcmpeqpd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vmovapd %xmm3, %xmm0 {%k1} {z} ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp Index: test/CodeGen/X86/avx512-trunc.ll =================================================================== --- test/CodeGen/X86/avx512-trunc.ll +++ test/CodeGen/X86/avx512-trunc.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f -mattr=+avx512vl -mattr=+avx512bw -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX attributes #0 = { nounwind } Index: test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll =================================================================== --- test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -2,8 +2,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512 ; ; 128-bit vectors @@ -379,19 +380,31 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: ext_i32_32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: ext_i32_32i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovd %edi, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-SLOW-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: ext_i32_32i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovd %edi, %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: ext_i32_32i8: ; AVX512: # %bb.0: @@ -683,26 +696,43 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; -; AVX2-LABEL: ext_i64_64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq %rdi, %xmm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: ext_i64_64i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovq %rdi, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,5,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: ext_i64_64i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovq %rdi, %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: ext_i64_64i8: ; AVX512: # %bb.0: Index: test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll =================================================================== --- test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -2,9 +2,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX12,AVX2,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512VLBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VLBW ; ; 128-bit vectors @@ -472,21 +473,35 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: ext_i32_32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: ext_i32_32i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovd %edi, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-SLOW-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsrlw $7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: ext_i32_32i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovd %edi, %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-FAST-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsrlw $7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: ext_i32_32i8: ; AVX512F: # %bb.0: @@ -885,31 +900,53 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; -; AVX2-LABEL: ext_i64_64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovq %rdi, %xmm0 -; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $7, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5] -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] -; AVX2-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlw $7, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: ext_i64_64i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovq %rdi, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[2,2,3,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsrlw $7, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-SLOW-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,4,5,5] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,2,3,3] +; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,6,7,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsrlw $7, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: ext_i64_64i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovq %rdi, %xmm0 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0,1,0,1,0,1,0,1,2,3,2,3,2,3,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[4,5,4,5,4,5,4,5,6,7,6,7,6,7,6,7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] +; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsrlw $7, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX2-FAST-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,8,9,8,9,8,9,10,11,10,11,10,11,10,11] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12,13,12,13,12,13,12,13,14,15,14,15,14,15,14,15] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpcmpeqb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsrlw $7, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: ext_i64_64i8: ; AVX512F: # %bb.0: Index: test/CodeGen/X86/broadcastm-lowering.ll =================================================================== --- test/CodeGen/X86/broadcastm-lowering.ll +++ test/CodeGen/X86/broadcastm-lowering.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=AVX512VLCDBW -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw| FileCheck %s --check-prefix=ALL --check-prefix=X86-AVX512VLCDBW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX512VLCDBW +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vl,avx512cd,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,X86-AVX512VLCDBW define <2 x i64> @test_mm_epi64(<8 x i16> %a, <8 x i16> %b) { ; AVX512CD-LABEL: test_mm_epi64: @@ -28,7 +28,7 @@ ; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax ; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax ; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0 -; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero ; X86-AVX512VLCDBW-NEXT: retl entry: %0 = icmp eq <8 x i16> %a, %b @@ -122,7 +122,7 @@ ; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax ; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax ; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0 -; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero ; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X86-AVX512VLCDBW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; X86-AVX512VLCDBW-NEXT: retl @@ -160,7 +160,7 @@ ; X86-AVX512VLCDBW-NEXT: kmovd %k0, %eax ; X86-AVX512VLCDBW-NEXT: movzbl %al, %eax ; X86-AVX512VLCDBW-NEXT: vmovd %eax, %xmm0 -; X86-AVX512VLCDBW-NEXT: vpbroadcastq %xmm0, %xmm0 +; X86-AVX512VLCDBW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero ; X86-AVX512VLCDBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; X86-AVX512VLCDBW-NEXT: retl entry: Index: test/CodeGen/X86/combine-shl.ll =================================================================== --- test/CodeGen/X86/combine-shl.ll +++ test/CodeGen/X86/combine-shl.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX-FAST ; fold (shl 0, x) -> 0 define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) { @@ -113,14 +114,23 @@ ; SSE-NEXT: pmulld %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_shl_trunc_and: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX-SLOW-LABEL: combine_vec_shl_trunc_and: +; AVX-SLOW: # %bb.0: +; AVX-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX-SLOW-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX-SLOW-NEXT: vzeroupper +; AVX-SLOW-NEXT: retq +; +; AVX-FAST-LABEL: combine_vec_shl_trunc_and: +; AVX-FAST: # %bb.0: +; AVX-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX-FAST-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 +; AVX-FAST-NEXT: vzeroupper +; AVX-FAST-NEXT: retq %1 = and <4 x i64> %y, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = shl <4 x i32> %x, %2 Index: test/CodeGen/X86/combine-sra.ll =================================================================== --- test/CodeGen/X86/combine-sra.ll +++ test/CodeGen/X86/combine-sra.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST ; fold (sra 0, x) -> 0 define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) { @@ -180,14 +181,23 @@ ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_ashr_trunc_and: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: combine_vec_ashr_trunc_and: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpsravd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq %1 = and <4 x i64> %y, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = ashr <4 x i32> %x, %2 @@ -213,14 +223,23 @@ ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_ashr_trunc_lshr: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: combine_vec_ashr_trunc_lshr: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq %1 = lshr <4 x i64> %x, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = ashr <4 x i32> %2, @@ -247,13 +266,21 @@ ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_ashr_trunc_ashr: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: combine_vec_ashr_trunc_ashr: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,5,7,5,7,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq %1 = ashr <4 x i64> %x, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = ashr <4 x i32> %2, Index: test/CodeGen/X86/combine-srl.ll =================================================================== --- test/CodeGen/X86/combine-srl.ll +++ test/CodeGen/X86/combine-srl.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST ; fold (srl 0, x) -> 0 define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) { @@ -215,14 +216,23 @@ ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_lshr_trunc_lshr1: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 -; AVX-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_lshr1: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: combine_vec_lshr_trunc_lshr1: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq %1 = lshr <4 x i64> %x, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = lshr <4 x i32> %2, @@ -446,14 +456,23 @@ ; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] ; SSE-NEXT: retq ; -; AVX-LABEL: combine_vec_lshr_trunc_and: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; AVX2-SLOW-LABEL: combine_vec_lshr_trunc_and: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: combine_vec_lshr_trunc_and: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq %1 = and <4 x i64> %y, %2 = trunc <4 x i64> %1 to <4 x i32> %3 = lshr <4 x i32> %x, %2 Index: test/CodeGen/X86/insertelement-zero.ll =================================================================== --- test/CodeGen/X86/insertelement-zero.ll +++ test/CodeGen/X86/insertelement-zero.ll @@ -4,7 +4,8 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST define <2 x double> @insert_v2f64_z1(<2 x double> %a) { ; SSE2-LABEL: insert_v2f64_z1: @@ -429,12 +430,24 @@ ; SSE41-NEXT: pinsrb $15, %eax, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: insert_v16i8_z123456789ABCDEz: -; AVX: # %bb.0: -; AVX-NEXT: xorl %eax, %eax -; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: insert_v16i8_z123456789ABCDEz: +; AVX1: # %bb.0: +; AVX1-NEXT: xorl %eax, %eax +; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 +; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: insert_v16i8_z123456789ABCDEz: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: xorl %eax, %eax +; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: insert_v16i8_z123456789ABCDEz: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: retq %1 = insertelement <16 x i8> %a, i8 0, i32 0 %2 = insertelement <16 x i8> %1, i8 0, i32 15 ret <16 x i8> %2 @@ -479,16 +492,25 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz: -; AVX2: # %bb.0: -; AVX2-NEXT: xorl %eax, %eax -; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: xorl %eax, %eax +; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 +; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 +; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7] +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq %1 = insertelement <32 x i8> %a, i8 0, i32 0 %2 = insertelement <32 x i8> %1, i8 0, i32 15 %3 = insertelement <32 x i8> %2, i8 0, i32 30 Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -2,7 +2,8 @@ ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+xop | FileCheck %s --check-prefix=XOP define void @v3i64(<2 x i64> %a, <2 x i64> %b, <3 x i64>* %p) nounwind { @@ -154,16 +155,36 @@ ; SSE42-NEXT: movq %xmm2, (%rdi) ; SSE42-NEXT: retq ; -; AVX-LABEL: v5i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX-NEXT: vpextrw $6, %xmm0, 8(%rdi) -; AVX-NEXT: vmovq %xmm1, (%rdi) -; AVX-NEXT: retq +; AVX1-LABEL: v5i16: +; AVX1: # %bb.0: +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX1-NEXT: vpextrw $6, %xmm0, 8(%rdi) +; AVX1-NEXT: vmovq %xmm1, (%rdi) +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: v5i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-SLOW-NEXT: vpextrw $6, %xmm0, 8(%rdi) +; AVX2-SLOW-NEXT: vmovq %xmm1, (%rdi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: v5i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,8,9,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,4,5,12,13,14,15,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; AVX2-FAST-NEXT: vpextrw $6, %xmm0, 8(%rdi) +; AVX2-FAST-NEXT: vmovq %xmm1, (%rdi) +; AVX2-FAST-NEXT: retq ; ; XOP-LABEL: v5i16: ; XOP: # %bb.0: @@ -550,18 +571,30 @@ ; AVX1-NEXT: vmovq %xmm2, 16(%rdi) ; AVX1-NEXT: retq ; -; AVX2-LABEL: v12i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] -; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] -; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] -; AVX2-NEXT: vmovdqa %xmm0, (%rdi) -; AVX2-NEXT: vmovq %xmm2, 16(%rdi) -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: v12i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,1,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] +; AVX2-SLOW-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-SLOW-NEXT: vmovq %xmm2, 16(%rdi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: v12i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm2 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,8,9,8,9,2,3,10,11,10,11,4,5,12,13] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4],xmm2[5],xmm3[6,7] +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,6,7,14,15,6,7,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovq %xmm0, 16(%rdi) +; AVX2-FAST-NEXT: vmovdqa %xmm2, (%rdi) +; AVX2-FAST-NEXT: retq ; ; XOP-LABEL: v12i16: ; XOP: # %bb.0: @@ -637,20 +670,35 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: v12i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] -; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6> -; AVX2-NEXT: vpermps %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vbroadcastsd %xmm1, %ymm1 -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: vmovaps %xmm2, 32(%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: v12i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[2,3,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm3 = ymm0[3,3,2,3,7,7,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm3[1,2],xmm2[3] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6> +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-SLOW-NEXT: vmovaps %xmm2, 32(%rdi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: v12i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vbroadcastsd %xmm1, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [3,3,7,7,7,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm0 +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[2,3,2,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3] +; AVX2-FAST-NEXT: vmovaps %xmm0, 32(%rdi) +; AVX2-FAST-NEXT: vmovaps %ymm2, (%rdi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; XOP-LABEL: v12i32: ; XOP: # %bb.0: @@ -1400,34 +1448,63 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: interleave_24i32_out: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rdi), %ymm0 -; AVX2-NEXT: vmovups 32(%rdi), %ymm1 -; AVX2-NEXT: vmovups 64(%rdi), %ymm2 -; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = -; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> -; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = -; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> -; AVX2-NEXT: vpermps %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> -; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] -; AVX2-NEXT: vmovups %ymm3, (%rsi) -; AVX2-NEXT: vmovups %ymm4, (%rdx) -; AVX2-NEXT: vmovups %ymm0, (%rcx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: interleave_24i32_out: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] +; AVX2-SLOW-NEXT: vmovups %ymm3, (%rsi) +; AVX2-SLOW-NEXT: vmovups %ymm4, (%rdx) +; AVX2-SLOW-NEXT: vmovups %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: interleave_24i32_out: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovups (%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovups 32(%rdi), %ymm1 +; AVX2-FAST-NEXT: vmovups 64(%rdi), %ymm2 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [0,1,0,3,0,1,4,7] +; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-FAST-NEXT: vmovups %ymm3, (%rsi) +; AVX2-FAST-NEXT: vmovups %ymm4, (%rdx) +; AVX2-FAST-NEXT: vmovups %ymm0, (%rcx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; XOP-LABEL: interleave_24i32_out: ; XOP: # %bb.0: @@ -1603,33 +1680,61 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: interleave_24i32_in: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovups (%rsi), %ymm0 -; AVX2-NEXT: vmovups (%rdx), %ymm1 -; AVX2-NEXT: vmovups (%rcx), %ymm2 -; AVX2-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] -; AVX2-NEXT: vbroadcastsd %xmm2, %ymm4 -; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-NEXT: vbroadcastsd 24(%rsi), %ymm5 -; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] -; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-NEXT: vmovups %ymm4, 64(%rdi) -; AVX2-NEXT: vmovups %ymm3, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: interleave_24i32_in: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovups (%rsi), %ymm0 +; AVX2-SLOW-NEXT: vmovups (%rdx), %ymm1 +; AVX2-SLOW-NEXT: vmovups (%rcx), %ymm2 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-SLOW-NEXT: vbroadcastsd %xmm2, %ymm4 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[2,1,3,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; AVX2-SLOW-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-SLOW-NEXT: vmovups %ymm4, 64(%rdi) +; AVX2-SLOW-NEXT: vmovups %ymm3, (%rdi) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: interleave_24i32_in: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovups (%rsi), %ymm0 +; AVX2-FAST-NEXT: vmovups (%rdx), %ymm1 +; AVX2-FAST-NEXT: vmovups (%rcx), %ymm2 +; AVX2-FAST-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[1,0,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,1] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] +; AVX2-FAST-NEXT: vbroadcastsd %xmm2, %ymm4 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm2[1,1,2,2] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2,3],ymm0[4],ymm4[5,6],ymm0[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm4[2],ymm0[3,4],ymm4[5],ymm0[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [5,6,5,6,5,6,7,7] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[2,1,3,3] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] +; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm2 +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; AVX2-FAST-NEXT: vmovups %ymm1, 64(%rdi) +; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi) +; AVX2-FAST-NEXT: vmovups %ymm3, (%rdi) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; XOP-LABEL: interleave_24i32_in: ; XOP: # %bb.0: Index: test/CodeGen/X86/psubus.ll =================================================================== --- test/CodeGen/X86/psubus.ll +++ test/CodeGen/X86/psubus.ll @@ -2,9 +2,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s -check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512 define <8 x i16> @test1(<8 x i16> %x) nounwind { ; SSE-LABEL: test1: @@ -16,11 +17,6 @@ ; AVX: # %bb.0: # %vector.ph ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq -; -; AVX512-LABEL: test1: -; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq vector.ph: %0 = icmp slt <8 x i16> %x, zeroinitializer %1 = xor <8 x i16> %x, @@ -38,11 +34,6 @@ ; AVX: # %bb.0: # %vector.ph ; AVX-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq -; -; AVX512-LABEL: test2: -; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq vector.ph: %0 = icmp ugt <8 x i16> %x, %1 = add <8 x i16> %x, @@ -98,11 +89,6 @@ ; AVX: # %bb.0: # %vector.ph ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq -; -; AVX512-LABEL: test4: -; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq vector.ph: %0 = icmp slt <16 x i8> %x, zeroinitializer %1 = xor <16 x i8> %x, @@ -120,11 +106,6 @@ ; AVX: # %bb.0: # %vector.ph ; AVX-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq -; -; AVX512-LABEL: test5: -; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq vector.ph: %0 = icmp ugt <16 x i8> %x, %1 = add <16 x i8> %x, @@ -1150,11 +1131,6 @@ ; AVX: # %bb.0: # %vector.ph ; AVX-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq -; -; AVX512-LABEL: psubus_8i16_max: -; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq vector.ph: %cmp = icmp ult <8 x i16> %x, %y %max = select <8 x i1> %cmp, <8 x i16> %y, <8 x i16> %x @@ -1172,11 +1148,6 @@ ; AVX: # %bb.0: # %vector.ph ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq -; -; AVX512-LABEL: psubus_16i8_max: -; AVX512: # %bb.0: # %vector.ph -; AVX512-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq vector.ph: %cmp = icmp ult <16 x i8> %x, %y %max = select <16 x i1> %cmp, <16 x i8> %y, <16 x i8> %x @@ -1902,32 +1873,58 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: psubus_8i64_max: -; AVX2: # %bb.0: # %vector.ph -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm6 -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm6 -; AVX2-NEXT: vpor %ymm4, %ymm3, %ymm4 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: psubus_8i64_max: +; AVX2-SLOW: # %bb.0: # %vector.ph +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm2, %ymm5 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm6 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5 +; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm1, %ymm6 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm3, %ymm4 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4 +; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: psubus_8i64_max: +; AVX2-FAST: # %bb.0: # %vector.ph +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] +; AVX2-FAST-NEXT: vpxor %ymm4, %ymm2, %ymm5 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm6 +; AVX2-FAST-NEXT: vpcmpgtq %ymm6, %ymm5, %ymm5 +; AVX2-FAST-NEXT: vpxor %ymm4, %ymm1, %ymm6 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm3, %ymm4 +; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4 +; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vblendvpd %ymm5, %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm3, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: psubus_8i64_max: ; AVX512: # %bb.0: # %vector.ph Index: test/CodeGen/X86/shuffle-of-splat-multiuses.ll =================================================================== --- test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX2,AVX2-FAST ; PR32449 define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind { @@ -27,12 +28,19 @@ } define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind { -; AVX2-LABEL: foo8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps %ymm0, (%rdi) -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: foo8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-SLOW-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: foo8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vbroadcastss {{.*#+}} ymm1 = [5,5,5,5,5,5,5,5] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vmovaps %ymm0, (%rdi) +; AVX2-FAST-NEXT: retq %res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> %res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> store <8 x float> %res, <8 x float>* %p Index: test/CodeGen/X86/shuffle-strided-with-offset-128.ll =================================================================== --- test/CodeGen/X86/shuffle-strided-with-offset-128.ll +++ test/CodeGen/X86/shuffle-strided-with-offset-128.ll @@ -5,9 +5,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define void @shuffle_v16i8_to_v8i8_1(<16 x i8>* %L, <8 x i8>* %S) nounwind { ; SSE2-LABEL: shuffle_v16i8_to_v8i8_1: @@ -400,8 +400,8 @@ ; ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: retq ; @@ -460,8 +460,8 @@ ; ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: retq ; @@ -520,8 +520,8 @@ ; ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: retq ; Index: test/CodeGen/X86/shuffle-strided-with-offset-256.ll =================================================================== --- test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -3,9 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define void @shuffle_v32i8_to_v16i8_1(<32 x i8>* %L, <16 x i8>* %S) nounwind { ; AVX1-LABEL: shuffle_v32i8_to_v16i8_1: @@ -405,10 +405,9 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper @@ -418,10 +417,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -431,10 +429,9 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper @@ -510,10 +507,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -589,10 +585,9 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper @@ -602,10 +597,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper @@ -615,10 +609,9 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper @@ -748,10 +741,9 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper @@ -773,10 +765,9 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,3,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,3,3,4,5,6,7] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,10,11,10,11,8,9,10,11,12,13,14,15] +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper @@ -1056,10 +1047,9 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512VL-NEXT: vzeroupper @@ -1081,10 +1071,9 @@ ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,1,3,4,5,6,7] -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,1,3,4,5,6,7] +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,14,15,14,15,6,7,8,9,10,11,12,13,14,15] +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) ; AVX512BWVL-NEXT: vzeroupper Index: test/CodeGen/X86/shuffle-strided-with-offset-512.ll =================================================================== --- test/CodeGen/X86/shuffle-strided-with-offset-512.ll +++ test/CodeGen/X86/shuffle-strided-with-offset-512.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define void @shuffle_v64i8_to_v32i8_1(<64 x i8>* %L, <32 x i8>* %S) nounwind { ; AVX512F-LABEL: shuffle_v64i8_to_v32i8_1: @@ -23,9 +23,9 @@ ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -47,9 +47,9 @@ ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31] ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -77,9 +77,9 @@ ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[6,7,2,3,4,5,6,7,2,3,6,7,10,11,14,15,22,23,18,19,20,21,22,23,18,19,22,23,26,27,30,31] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,30,31,26,27,28,29,30,31] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -111,15 +111,45 @@ } define void @shuffle_v16i32_to_v8i32_1(<16 x i32>* %L, <8 x i32>* %S) nounwind { -; AVX512-LABEL: shuffle_v16i32_to_v8i32_1: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] -; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512-NEXT: vmovaps %ymm0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %zmm0 +; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovaps %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovaps (%rdi), %zmm0 +; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7] +; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32_1: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15] +; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %L %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> store <8 x i32> %strided.vec, <8 x i32>* %S @@ -399,16 +429,14 @@ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) @@ -420,16 +448,14 @@ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) @@ -478,16 +504,14 @@ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) @@ -499,16 +523,14 @@ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,4,5,12,13,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) @@ -557,16 +579,14 @@ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) @@ -578,16 +598,14 @@ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [12,13,14,15,6,7,14,15,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) Index: test/CodeGen/X86/shuffle-vs-trunc-128.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-128.ll +++ test/CodeGen/X86/shuffle-vs-trunc-128.ll @@ -5,9 +5,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL ; PR31551 ; Pairs of shufflevector:trunc functions with functional equivalence. @@ -473,8 +473,8 @@ ; ; AVX512BW-LABEL: shuffle_v8i16_to_v2i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: retq ; @@ -533,8 +533,8 @@ ; ; AVX512BW-LABEL: trunc_v2i64_to_v2i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] ; AVX512BW-NEXT: vmovd %xmm0, (%rsi) ; AVX512BW-NEXT: retq ; Index: test/CodeGen/X86/shuffle-vs-trunc-256.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -3,9 +3,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL ; PR31551 ; Pairs of shufflevector:trunc functions with functional equivalence. @@ -505,10 +505,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vzeroupper Index: test/CodeGen/X86/shuffle-vs-trunc-512.ll =================================================================== --- test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL ; PR31551 ; Pairs of shufflevector:trunc functions with functional equivalence. @@ -27,9 +27,9 @@ ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -51,9 +51,9 @@ ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] ; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512BWVL-NEXT: vmovdqa %ymm0, (%rsi) +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] +; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 +; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %vec = load <64 x i8>, <64 x i8>* %L @@ -120,13 +120,14 @@ ; ; AVX512VL-LABEL: shuffle_v32i16_to_v16i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VL-NEXT: vmovaps %ymm0, (%rsi) +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] +; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -134,10 +135,9 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512BW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] +; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX512BW-NEXT: vmovaps %ymm0, (%rsi) @@ -174,15 +174,45 @@ } define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { -; AVX512-LABEL: shuffle_v16i32_to_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %zmm0 -; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512-NEXT: vmovaps %ymm0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX512F-LABEL: shuffle_v16i32_to_v8i32: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovaps (%rdi), %zmm0 +; AVX512F-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512F-NEXT: vmovaps %ymm0, (%rsi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i32_to_v8i32: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] +; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: shuffle_v16i32_to_v8i32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovaps (%rdi), %zmm0 +; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] +; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512BW-NEXT: vmovaps %ymm0, (%rsi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: vmovdqa32 (%rdi), %zmm0 +; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] +; AVX512BWVL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %L %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> store <8 x i32> %strided.vec, <8 x i32>* %S @@ -326,16 +356,14 @@ ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) @@ -347,16 +375,14 @@ ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) Index: test/CodeGen/X86/vector-half-conversions.ll =================================================================== --- test/CodeGen/X86/vector-half-conversions.ll +++ test/CodeGen/X86/vector-half-conversions.ll @@ -1,8 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+fast-variable-shuffle -verify-machineinstrs | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL ; ; Half to Float @@ -1189,22 +1190,38 @@ ; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: retq ; -; AVX2-LABEL: cvt_2i16_to_2f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: movswl %ax, %ecx -; AVX2-NEXT: shrl $16, %eax -; AVX2-NEXT: cwtl -; AVX2-NEXT: vmovd %eax, %xmm0 -; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX2-NEXT: vmovd %ecx, %xmm1 -; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: cvt_2i16_to_2f64: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovd %xmm0, %eax +; AVX2-SLOW-NEXT: movswl %ax, %ecx +; AVX2-SLOW-NEXT: shrl $16, %eax +; AVX2-SLOW-NEXT: cwtl +; AVX2-SLOW-NEXT: vmovd %eax, %xmm0 +; AVX2-SLOW-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovd %ecx, %xmm1 +; AVX2-SLOW-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: cvt_2i16_to_2f64: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] +; AVX2-FAST-NEXT: vmovd %xmm0, %eax +; AVX2-FAST-NEXT: movswl %ax, %ecx +; AVX2-FAST-NEXT: shrl $16, %eax +; AVX2-FAST-NEXT: cwtl +; AVX2-FAST-NEXT: vmovd %eax, %xmm0 +; AVX2-FAST-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovd %ecx, %xmm1 +; AVX2-FAST-NEXT: vcvtph2ps %xmm1, %xmm1 +; AVX2-FAST-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: cvt_2i16_to_2f64: ; AVX512F: # %bb.0: Index: test/CodeGen/X86/vector-shuffle-128-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v4.ll +++ test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -4,8 +4,9 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX2OR512VL --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2OR512VL --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) { ; SSE-LABEL: shuffle_v4i32_0001: @@ -1238,18 +1239,28 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v4i32_z4zz: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_z4zz: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v4i32_z4zz: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v4i32_z4zz: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_z4zz: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> ret <4 x i32> %shuffle @@ -1284,18 +1295,28 @@ ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,0,1] ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v4i32_zz4z: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX1OR2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v4i32_zz4z: +; AVX1: # %bb.0: +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX1-NEXT: retq +; +; AVX2-SLOW-LABEL: shuffle_v4i32_zz4z: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v4i32_zz4z: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero +; AVX2-FAST-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v4i32_zz4z: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,1] +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero ; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> ret <4 x i32> %shuffle @@ -1351,12 +1372,22 @@ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2OR512VL-LABEL: shuffle_v4i32_z6zz: -; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2OR512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] -; AVX2OR512VL-NEXT: retq +; AVX2-SLOW-LABEL: shuffle_v4i32_z6zz: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: shuffle_v4i32_z6zz: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v4i32_z6zz: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: retq %shuffle = shufflevector <4 x i32> zeroinitializer, <4 x i32> %a, <4 x i32> ret <4 x i32> %shuffle } Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -3,7 +3,8 @@ ; RUN: llc < %s -mcpu=x86-64 -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 ; RUN: llc < %s -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST ; ; Verify that the DAG combiner correctly folds bitwise operations across ; shuffles, nested shuffles with undef, pairs of nested shuffles, and other @@ -2536,12 +2537,19 @@ ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3] ; AVX1-NEXT: retq ; -; AVX2-LABEL: combine_unneeded_subvector1: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: combine_unneeded_subvector1: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: combine_unneeded_subvector1: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq %b = add <8 x i32> %a, %c = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> ret <8 x i32> %c @@ -2873,12 +2881,19 @@ ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,0],ymm1[3,2],ymm0[5,4],ymm1[7,6] ; AVX1-NEXT: retq ; -; AVX2-LABEL: PR22412: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1] -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: PR22412: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,1] +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: PR22412: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [1,0,7,6,5,4,3,2] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: retq entry: %s1 = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> %s2 = shufflevector <8 x float> %s1, <8 x float> undef, <8 x i32> Index: test/CodeGen/X86/vector-shuffle-v1.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-v1.ll +++ test/CodeGen/X86/vector-shuffle-v1.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq| FileCheck %s --check-prefix=VL_BW_DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefix=VL_BW_DQ define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) { ; AVX512F-LABEL: shuf2i1_1_0: @@ -309,10 +309,10 @@ ; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7] -; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3] +; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vpslld $31, %ymm2, %ymm0 ; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovw %k0, %eax ; AVX512VL-NEXT: # kill: def %al killed %al killed %eax @@ -324,10 +324,10 @@ ; VL_BW_DQ-NEXT: kmovd %edi, %k0 ; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 ; VL_BW_DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; VL_BW_DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; VL_BW_DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL_BW_DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3,4,5,6,7] -; VL_BW_DQ-NEXT: vpmovd2m %ymm0, %k0 +; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,10,3,0,1,2,3] +; VL_BW_DQ-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 +; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 ; VL_BW_DQ-NEXT: kmovd %k0, %eax ; VL_BW_DQ-NEXT: # kill: def %al killed %al killed %eax ; VL_BW_DQ-NEXT: vzeroupper Index: test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- test/CodeGen/X86/vector-trunc-math.ll +++ test/CodeGen/X86/vector-trunc-math.ll @@ -1,10 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ ; ; add @@ -28,14 +29,23 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_add_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_add_v4i64_v4i32: ; AVX512: # %bb.0: @@ -90,20 +100,34 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_add_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_add_v8i64_v8i16: ; AVX512: # %bb.0: @@ -228,33 +252,58 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_add_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_add_v16i64_v16i8: ; AVX512: # %bb.0: @@ -467,13 +516,21 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_add_const_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -520,18 +577,30 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_add_const_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v8i64_v8i16: ; AVX512: # %bb.0: @@ -634,30 +703,52 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_add_const_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_add_const_v16i64_v16i8: ; AVX512: # %bb.0: @@ -811,14 +902,23 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_sub_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_v4i64_v4i32: ; AVX512: # %bb.0: @@ -873,20 +973,34 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_sub_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_v8i64_v8i16: ; AVX512: # %bb.0: @@ -1011,33 +1125,58 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_sub_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsubq %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpsubq %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpsubq %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpsubq %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_v16i64_v16i8: ; AVX512: # %bb.0: @@ -1209,14 +1348,23 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_sub_const_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -1276,20 +1424,34 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_sub_const_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX512: # %bb.0: @@ -1415,33 +1577,58 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_sub_const_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 -; AVX2-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpsubq {{.*}}(%rip), %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX512: # %bb.0: @@ -1623,15 +1810,24 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_mul_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v4i64_v4i32: ; AVX512F: # %bb.0: @@ -1720,26 +1916,44 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_mul_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v8i64_v8i16: ; AVX512F: # %bb.0: @@ -2028,41 +2242,70 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_mul_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vpmulld %xmm7, %xmm3, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpmulld %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmulld %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpmulld %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm7, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 +; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3 +; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 +; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 +; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm4 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 +; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc_mul_v16i64_v16i8: ; AVX512F: # %bb.0: @@ -2340,13 +2583,21 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -2393,18 +2644,30 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: ; AVX512: # %bb.0: @@ -2613,33 +2876,58 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_mul_const_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_mul_const_v16i64_v16i8: ; AVX512: # %bb.0: @@ -2829,14 +3117,23 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_and_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_and_v4i64_v4i32: ; AVX512: # %bb.0: @@ -2887,20 +3184,34 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_and_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_and_v8i64_v8i16: ; AVX512: # %bb.0: @@ -3015,33 +3326,58 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_and_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_and_v16i64_v16i8: ; AVX512: # %bb.0: @@ -3199,13 +3535,21 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_and_const_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -3252,18 +3596,30 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_and_const_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v8i64_v8i16: ; AVX512: # %bb.0: @@ -3366,30 +3722,52 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_and_const_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_and_const_v16i64_v16i8: ; AVX512: # %bb.0: @@ -3541,14 +3919,23 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_xor_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_xor_v4i64_v4i32: ; AVX512: # %bb.0: @@ -3599,20 +3986,34 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_xor_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_xor_v8i64_v8i16: ; AVX512: # %bb.0: @@ -3727,33 +4128,58 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_xor_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpxor %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpxor %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpxor %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_xor_v16i64_v16i8: ; AVX512: # %bb.0: @@ -3911,13 +4337,21 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_xor_const_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -3964,18 +4398,30 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_xor_const_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX512: # %bb.0: @@ -4078,30 +4524,52 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_xor_const_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX512: # %bb.0: @@ -4253,14 +4721,23 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_or_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_or_v4i64_v4i32: ; AVX512: # %bb.0: @@ -4311,20 +4788,34 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_or_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_or_v8i64_v8i16: ; AVX512: # %bb.0: @@ -4439,33 +4930,58 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_or_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpor %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpor %ymm5, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpor %ymm7, %ymm3, %ymm3 +; AVX2-SLOW-NEXT: vpor %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3 +; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_or_v16i64_v16i8: ; AVX512: # %bb.0: @@ -4623,13 +5139,21 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_or_const_v4i64_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v4i64_v4i32: ; AVX512: # %bb.0: @@ -4676,18 +5200,30 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_or_const_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v8i64_v8i16: ; AVX512: # %bb.0: @@ -4790,30 +5326,52 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc_or_const_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 +; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc_or_const_v16i64_v16i8: ; AVX512: # %bb.0: Index: test/CodeGen/X86/vector-trunc.ll =================================================================== --- test/CodeGen/X86/vector-trunc.ll +++ test/CodeGen/X86/vector-trunc.ll @@ -3,11 +3,12 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) { ; SSE-LABEL: trunc8i64_8i32: @@ -26,14 +27,22 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc8i64_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc8i64_8i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32: ; AVX512: # %bb.0: # %entry @@ -103,14 +112,22 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc8i64_8i32_ashr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[1,3,2,3,5,7,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_ashr: ; AVX512: # %bb.0: # %entry @@ -148,16 +165,26 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc8i64_8i32_lshr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i32_lshr: ; AVX512: # %bb.0: # %entry @@ -228,18 +255,30 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc8i64_8i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc8i64_8i16: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i16: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: # kill: def %xmm0 killed %xmm0 killed %ymm0 +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i16: ; AVX512: # %bb.0: # %entry @@ -283,19 +322,32 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc8i64_8i8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc8i64_8i8: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-SLOW-NEXT: vmovq %xmm0, (%rax) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc8i64_8i8: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX2-FAST-NEXT: vmovq %xmm0, (%rax) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512-LABEL: trunc8i64_8i8: ; AVX512: # %bb.0: # %entry @@ -1368,14 +1420,22 @@ ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc2x4i64_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc2x4i64_8i32: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x4i64_8i32: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc2x4i64_8i32: ; AVX512F: # %bb.0: # %entry @@ -1474,18 +1534,30 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: trunc2x4i64_8i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: trunc2x4i64_8i16: +; AVX2-SLOW: # %bb.0: # %entry +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: trunc2x4i64_8i16: +; AVX2-FAST: # %bb.0: # %entry +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: trunc2x4i64_8i16: ; AVX512F: # %bb.0: # %entry @@ -1882,14 +1954,22 @@ ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; -; AVX2-LABEL: PR32160: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] -; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: PR32160: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-LABEL: PR32160: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: PR32160: ; AVX512F: # %bb.0: @@ -1903,8 +1983,7 @@ ; AVX512VL-LABEL: PR32160: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] -; AVX512VL-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; @@ -1912,16 +1991,14 @@ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def %ymm0 killed %ymm0 def %zmm0 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] -; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BWVL-LABEL: PR32160: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] -; AVX512BWVL-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] ; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq %shuf = trunc <8 x i32> %x to <8 x i16> Index: test/CodeGen/X86/vector-zext.ll =================================================================== --- test/CodeGen/X86/vector-zext.ll +++ test/CodeGen/X86/vector-zext.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp { ; SSE2-LABEL: zext_16i8_to_8i16: @@ -1929,11 +1929,16 @@ ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero ; AVX2-FAST-NEXT: retq ; -; AVX512-LABEL: shuf_zext_8i16_to_4i32_offset1: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: retq +; AVX512F-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX512F: # %bb.0: # %entry +; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: shuf_zext_8i16_to_4i32_offset1: +; AVX512BW: # %bb.0: # %entry +; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero +; AVX512BW-NEXT: retq entry: %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> %Z = bitcast <8 x i16> %B to <4 x i32>