Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -9052,6 +9052,26 @@ MutableArrayRef HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef HToHInputs(HiInputs.data() + NumLToH, NumHToH); + // If we are splatting two values from one half - one to each half, then + // we can shuffle that half so each is splatted to a dword, then splat those + // to their respective halves. + auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp, + int DOffset) { + int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4}; + int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1}; + V = DAG.getNode(ShufWOp, DL, VT, V, + getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); + V = DAG.getBitcast(PSHUFDVT, V); + V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, + getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); + return DAG.getBitcast(VT, V); + }; + + if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0) + return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0); + if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0) + return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2); + // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through Index: llvm/trunk/test/CodeGen/X86/avx-splat.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-splat.ll +++ llvm/trunk/test/CodeGen/X86/avx-splat.ll @@ -15,7 +15,8 @@ define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp { ; CHECK-LABEL: funcB: ; CHECK: ## BB#0: ## %entry -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[10,11,10,11,10,11,10,11,10,11,10,11,10,11,10,11] +; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: retq entry: Index: llvm/trunk/test/CodeGen/X86/psubus.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/psubus.ll +++ llvm/trunk/test/CodeGen/X86/psubus.ll @@ -54,30 +54,21 @@ } define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind { -; SSE2-LABEL: test3: -; SSE2: ## BB#0: ## %vector.ph -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE2-NEXT: movdqu (%rdi), %xmm1 -; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: movdqu %xmm1, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test3: -; SSSE3: ## BB#0: ## %vector.ph -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; SSSE3-NEXT: movdqu (%rdi), %xmm1 -; SSSE3-NEXT: psubusw %xmm0, %xmm1 -; SSSE3-NEXT: movdqu %xmm1, (%rdi) -; SSSE3-NEXT: retq +; SSE-LABEL: test3: +; SSE: ## BB#0: ## %vector.ph +; SSE-NEXT: movd %esi, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqu (%rdi), %xmm1 +; SSE-NEXT: psubusw %xmm0, %xmm1 +; SSE-NEXT: movdqu %xmm1, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: test3: ; AVX1: ## BB#0: ## %vector.ph ; AVX1-NEXT: vmovd %esi, 
%xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vmovdqu (%rdi), %xmm1 ; AVX1-NEXT: vpsubusw %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vmovdqu %xmm0, (%rdi) @@ -159,9 +150,8 @@ ; SSE2: ## BB#0: ## %vector.ph ; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: movdqu (%rdi), %xmm1 ; SSE2-NEXT: psubusb %xmm0, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rdi) @@ -304,46 +294,34 @@ } define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind { -; SSE2-LABEL: test9: -; SSE2: ## BB#0: ## %vector.ph -; SSE2-NEXT: movd %esi, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE2-NEXT: movdqu (%rdi), %xmm1 -; SSE2-NEXT: movdqu 16(%rdi), %xmm2 -; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: psubusw %xmm0, %xmm2 -; SSE2-NEXT: movdqu %xmm2, 16(%rdi) -; SSE2-NEXT: movdqu %xmm1, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: test9: -; SSSE3: ## BB#0: ## %vector.ph -; SSSE3-NEXT: movd %esi, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; SSSE3-NEXT: movdqu (%rdi), %xmm1 -; SSSE3-NEXT: movdqu 16(%rdi), %xmm2 -; SSSE3-NEXT: psubusw %xmm0, %xmm1 -; SSSE3-NEXT: psubusw %xmm0, %xmm2 -; SSSE3-NEXT: movdqu %xmm2, 16(%rdi) -; SSSE3-NEXT: movdqu %xmm1, (%rdi) -; SSSE3-NEXT: retq +; SSE-LABEL: test9: +; SSE: ## BB#0: ## %vector.ph +; SSE-NEXT: movd %esi, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: movdqu (%rdi), %xmm1 +; SSE-NEXT: movdqu 16(%rdi), %xmm2 +; SSE-NEXT: psubusw %xmm0, %xmm1 +; SSE-NEXT: psubusw %xmm0, %xmm2 +; SSE-NEXT: movdqu %xmm2, 16(%rdi) +; SSE-NEXT: movdqu %xmm1, (%rdi) +; SSE-NEXT: retq ; ; AVX1-LABEL: test9: ; AVX1: ## BB#0: ## %vector.ph ; AVX1-NEXT: vmovups (%rdi), %ymm0 -; AVX1-NEXT: vmovd %esi, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vmovd %esi, %xmm2 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX1-NEXT: vpsubw %xmm2, %xmm1, %xmm3 +; AVX1-NEXT: vpsubw %xmm2, %xmm0, %xmm4 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 -; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm4 +; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxuw %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper @@ -471,9 +449,8 @@ ; SSE2: ## BB#0: ## %vector.ph ; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: movdqu (%rdi), %xmm1 ; SSE2-NEXT: movdqu 16(%rdi), %xmm2 ; SSE2-NEXT: psubusb %xmm0, %xmm1 Index: llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll +++ llvm/trunk/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -745,9 +745,8 @@ ; SSE2-LABEL: splatvar_shift_v16i8: ; SSE2: # BB#0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; SSE2-NEXT: psllw $5, %xmm3 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] @@ -949,9 +948,8 @@ ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; X32-SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,4,4,4] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1] ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; X32-SSE-NEXT: psllw $5, %xmm3 ; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] Index: llvm/trunk/test/CodeGen/X86/vector-shift-lshr-128.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shift-lshr-128.ll +++ llvm/trunk/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -597,9 +597,8 @@ ; SSE2-LABEL: splatvar_shift_v16i8: ; SSE2: # BB#0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE2-NEXT: psllw $5, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -727,9 +726,8 @@ ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; X32-SSE-NEXT: psllw $5, %xmm2 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 Index: llvm/trunk/test/CodeGen/X86/vector-shift-shl-128.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shift-shl-128.ll +++ llvm/trunk/test/CodeGen/X86/vector-shift-shl-128.ll @@ -545,9 +545,8 @@ ; 
SSE2-LABEL: splatvar_shift_v16i8: ; SSE2: # BB#0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; SSE2-NEXT: psllw $5, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 @@ -667,9 +666,8 @@ ; X32-SSE-LABEL: splatvar_shift_v16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,4,4,4] +; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,1,1] ; X32-SSE-NEXT: psllw $5, %xmm2 ; X32-SSE-NEXT: pxor %xmm1, %xmm1 ; X32-SSE-NEXT: pxor %xmm3, %xmm3 Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -9,20 +9,11 @@ target triple = "x86_64-unknown-unknown" define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) { -; FIXME: SSE2 should look like the following: -; FIXME-LABEL: @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00 -; FIXME: # BB#0: -; FIXME-NEXT: punpcklbw %xmm0, %xmm0 -; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1] -; FIXME-NEXT: retq -; ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; SSE2: # BB#0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: @@ -55,9 +46,8 @@ ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: ; SSE2: # BB#0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_01_01_01_01_01_01_01_01: @@ -82,10 +72,10 @@ ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,2,4,5,6,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,2,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -182,33 +172,16 @@ } define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { -; FIXME: SSE2 should be the following: -; 
FIXME-LABEL: @shuffle_v16i8_0101010101010101 -; FIXME: # BB#0: -; FIXME-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; FIXME-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,0,1] -; FIXME-NEXT: retq -; -; SSE2-LABEL: shuffle_v16i8_0101010101010101: -; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_0101010101010101: -; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_0101010101010101: -; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_v16i8_0101010101010101: +; SSE: # BB#0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v16i8_0101010101010101: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i8_0101010101010101: @@ -252,9 +225,8 @@ ; SSE2: # BB#0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 @@ -1132,9 +1104,8 @@ ; SSE2-NEXT: packuswb %xmm5, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] ; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,3,1,1,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; SSE2-NEXT: pandn %xmm1, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1377,9 +1348,8 @@ ; SSE2: # BB#0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_mem_v16i8_i32: @@ -1420,9 +1390,8 @@ ; SSE2-NEXT: movsbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8: @@ -1466,9 +1435,8 @@ ; SSE2: # BB#0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32: @@ -1505,9 +1473,8 @@ ; SSE2: # BB#0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32: @@ -1545,9 +1512,8 @@ ; SSE2-NEXT: movsbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: @@ -1592,9 +1558,8 @@ ; SSE2-NEXT: movsbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -61,26 +61,16 @@ } define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) { -; SSE2-LABEL: shuffle_v8i16_00000000: -; SSE2: # BB#0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v8i16_00000000: -; SSSE3: # BB#0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v8i16_00000000: -; SSE41: # BB#0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_v8i16_00000000: +; SSE: # BB#0: +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v8i16_00000000: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v8i16_00000000: @@ -2151,9 +2141,8 @@ ; SSE2-LABEL: insert_dup_mem_v8i16_i32: ; SSE2: # BB#0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_mem_v8i16_i32: @@ -2190,9 +2179,8 @@ ; SSE2: # BB#0: ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_mem_v8i16_sext_i16: @@ -2234,9 +2222,8 @@ ; SSE2-LABEL: insert_dup_elt1_mem_v8i16_i32: ; SSE2: # BB#0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_i32: @@ -2272,9 +2259,9 @@ ; SSE2-LABEL: insert_dup_elt3_mem_v8i16_i32: ; SSE2: # BB#0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32: @@ -2311,9 +2298,8 @@ ; SSE2: # BB#0: ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: @@ -2357,9 +2343,9 @@ ; SSE2: # BB#0: ; SSE2-NEXT: movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -7,7 +7,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -22,10 +23,9 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,4] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,2,3,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -42,7 +42,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; 
AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,4,5,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -60,7 +61,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,6,7,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -78,7 +80,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -96,7 +99,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -114,7 +118,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -132,7 +137,8 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -153,7 +159,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 
= xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -161,7 +168,8 @@ ; AVX2: # BB#0: ; AVX2-NEXT: vpbroadcastw %xmm0, %xmm1 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -175,7 +183,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,6,7,0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -196,7 +205,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -216,7 +226,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -236,7 +247,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -256,7 +268,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -276,7 +289,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, 
%ymm0 ; AVX1-NEXT: retq ; @@ -296,7 +310,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -313,16 +328,18 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -331,16 +348,18 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,7,7,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[14,15,14,15,14,15,14,15,14,15,14,15,14,15,14,15,30,31,30,31,30,31,30,31,30,31,30,31,30,31,30,31] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -721,8 +740,9 @@ ; ; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,4,5,0,1,0,1,0,1,12,13,0,1,16,17,16,17,20,21,16,17,16,17,16,17,28,29,16,17] ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] ; AVX2-NEXT: 
vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -1162,7 +1182,8 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,1,2,3,4,5,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,4,5,6,7,0,1,0,1,12,13,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -1654,7 +1675,8 @@ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,2,3] -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -3269,13 +3291,15 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, <16 x i16> %b) { ; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX1: # BB#0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> @@ -3286,7 +3310,8 @@ ; AVX1-LABEL: shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -3311,7 +3336,8 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, <16 x i16> %b) { ; ALL-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; ALL: # BB#0: -; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7,6,7,6,7,6,7,6,7,6,7,6,7,6,7] +; ALL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] +; ALL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; ALL-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3321,13 +3347,15 @@ ; AVX1-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; AVX2: # BB#0: ; AVX2-NEXT: vextracti128 $1, %ymm0, 
%xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -3378,7 +3406,8 @@ ; AVX1-LABEL: insert_dup_mem_v16i16_i32: ; AVX1: # BB#0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -3398,7 +3427,8 @@ ; AVX1: # BB#0: ; AVX1-NEXT: movswl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -3420,7 +3450,8 @@ ; AVX1-LABEL: insert_dup_elt1_mem_v16i16_i32: ; AVX1: # BB#0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -970,7 +970,8 @@ ; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32: ; AVX1: # BB#0: ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -988,21 +989,23 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48(<32 x i8> %a, <32 x i8> %b) { ; AVX1-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, 
%xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX2: # BB#0: ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq @@ -1465,7 +1468,7 @@ ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,8,8,9,9,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,8,8,8,8,8,8,8,8] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ;
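
For reference, the SplatHalfs lambda added in the X86ISelLowering.cpp hunk above reduces a "two values from one half, one splatted to each destination half" v8i16 shuffle to a single PSHUFLW/PSHUFHW followed by a PSHUFD. The standalone C++ sketch below is a hypothetical illustration only, not part of the patch; pshuflw/pshufd here are plain helper functions modeling the instruction semantics, not LLVM APIs. It traces the mask arithmetic for the PSHUFLW case (DOffset = 0) on an element-index vector:

// Models SplatHalfs for the low-half case: PSHUFHalfMask splats each chosen
// element across one word pair, and PSHUFDMask then broadcasts those dwords
// into the two 64-bit halves of the result.
#include <array>
#include <cstdio>

// PSHUFLW semantics: permute words 0-3 by Mask, pass words 4-7 through.
static std::array<int, 8> pshuflw(std::array<int, 8> V, const int Mask[4]) {
  std::array<int, 8> R = V;
  for (int i = 0; i < 4; ++i)
    R[i] = V[Mask[i]];
  return R;
}

// PSHUFD semantics: permute the four dwords (adjacent word pairs) by Mask.
static std::array<int, 8> pshufd(std::array<int, 8> V, const int Mask[4]) {
  std::array<int, 8> R;
  for (int i = 0; i < 4; ++i) {
    R[2 * i] = V[2 * Mask[i]];
    R[2 * i + 1] = V[2 * Mask[i] + 1];
  }
  return R;
}

int main() {
  // Example inputs: splat element 1 into the low half and element 2 into the
  // high half; both source elements live in the low half, as required by the
  // NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0 condition.
  int LoInput = 1, HiInput = 2, DOffset = 0;
  std::array<int, 8> V = {0, 1, 2, 3, 4, 5, 6, 7}; // identity element indices

  int PSHUFHalfMask[4] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
  int PSHUFDMask[4] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};

  std::array<int, 8> R = pshufd(pshuflw(V, PSHUFHalfMask), PSHUFDMask);
  for (int E : R)
    std::printf("%d ", E); // prints: 1 1 1 1 2 2 2 2
  std::printf("\n");
  return 0;
}

With LoInput == HiInput == 0 this is exactly the pshuflw {0,0,0,0,4,5,6,7} + pshufd {0,0,1,1} pair that replaces the longer pshufd/pshuflw/pshufhw (or pshufb) sequences in the updated CHECK lines.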