Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14076,16 +14076,19 @@
 
 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
-// This combine is done in the following cases:
-// 1. Both N0,N1 are BUILD_VECTOR's composed of constants or undefs.
-// 2. Only one of N0,N1 is a BUILD_VECTOR composed of constants or undefs -
-//    Combine iff that node is ALL_ZEROS. We prefer not to combine a
-//    BUILD_VECTOR of all constants to allow efficient materialization of
-//    constant vectors, but the ALL_ZEROS is an exception because
-//    zero-extension matching seems to rely on having BUILD_VECTOR nodes with
-//    zero padding between elements. FIXME: Eliminate this exception for
-//    ALL_ZEROS constant vectors.
-// 3. Neither N0,N1 are composed of only constants.
+//
+// SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
+// a simplification in some sense, but it isn't appropriate in general: some
+// BUILD_VECTORs are substantially cheaper than others. The general case
+// of a BUILD_VECTOR requires inserting each element individually (or
+// performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
+// all constants is a single constant pool load. A BUILD_VECTOR where each
+// element is identical is a splat. A BUILD_VECTOR where most of the operands
+// are undef lowers to a small number of element insertions.
+//
+// For the moment, only handle cases where both operands are constant. A
+// constant pool load is cheap enough that we can't substantially pessimize
+// code even if we miss some lowering that is a little more clever.
 static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
                                        SelectionDAG &DAG,
                                        const TargetLowering &TLI) {
@@ -14094,18 +14097,10 @@
   SDValue N0 = SVN->getOperand(0);
   SDValue N1 = SVN->getOperand(1);
 
-  if (!N0->hasOneUse() || !N1->hasOneUse())
+  if (!isAnyConstantBuildVector(N0.getNode()))
+    return SDValue();
+  if (!N1.isUndef() && !isAnyConstantBuildVector(N1.getNode()))
     return SDValue();
-  // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as
-  // discussed above.
-  if (!N1.isUndef()) {
-    bool N0AnyConst = isAnyConstantBuildVector(N0.getNode());
-    bool N1AnyConst = isAnyConstantBuildVector(N1.getNode());
-    if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
-      return SDValue();
-    if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
-      return SDValue();
-  }
 
   SmallVector<SDValue, 8> Ops;
   for (int M : SVN->getMask()) {
@@ -14363,6 +14358,8 @@
   if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
       Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
     ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
+    if (OtherSV->isSplat())
+      return SDValue();
 
     // The incoming shuffle must be of the same type as the result of the
     // current shuffle.
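As a rough illustration of the only case the rewritten combineShuffleOfScalars still folds (a sketch with made-up values, written as LLVM IR for readability; the combine itself runs on the equivalent SelectionDAG BUILD_VECTOR/VECTOR_SHUFFLE nodes):

    %v = shufflevector <4 x i32> <i32 10, i32 11, i32 12, i32 13>,
                       <4 x i32> <i32 20, i32 21, i32 22, i32 23>,
                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
    ; folds to the constant vector <i32 10, i32 20, i32 11, i32 21>,
    ; which lowers to a single constant pool load.

Shuffles with non-constant inputs are no longer combined here, and the new isSplat() bail-out above keeps the shuffle-of-shuffle merge from folding away a splat shuffle, so targets can still select splat-based sequences; the new vdup_zip test in test/CodeGen/ARM/vzip.ll below covers that case.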
Index: test/CodeGen/AArch64/arm64-neon-copy.ll
===================================================================
--- test/CodeGen/AArch64/arm64-neon-copy.ll
+++ test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1122,7 +1122,7 @@
 ; CHECK-LABEL: test_concat_diff_v1i32_v1i32:
 ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
 ; CHECK: sqabs s{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: ins {{v[0-9]+}}.s[1], w{{[0-9]+}}
+; CHECK: zip1 v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s
 entry:
   %c = tail call i32 @llvm.aarch64.neon.sqabs.i32(i32 %a)
   %d = insertelement <2 x i32> undef, i32 %c, i32 0
Index: test/CodeGen/ARM/vtrn.ll
===================================================================
--- test/CodeGen/ARM/vtrn.ll
+++ test/CodeGen/ARM/vtrn.ll
@@ -343,9 +343,8 @@
 define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
                                              <4 x i32> %cmp0, <4 x i32> %cmp1,
                                              <4 x i16> %cmp2, <4 x i16> %cmp3) {
-  ; CHECK-LABEL: vtrn_mismatched_builvector0
+  ; CHECK-LABEL: vtrn_mismatched_builvector0:
   ; CHECK: vmovn.i32
-  ; CHECK: vtrn
   ; CHECK: vbsl
   %c0 = icmp ult <4 x i32> %cmp0, %cmp1
   %c1 = icmp ult <4 x i16> %cmp2, %cmp3
@@ -359,10 +358,9 @@
 ; (from the icmp operation).
 define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
                                              <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
-  ; CHECK-LABEL: vtrn_mismatched_builvector1
+  ; CHECK-LABEL: vtrn_mismatched_builvector1:
   ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
   ; CHECK: vmovl
-  ; CHECK: vtrn.8
   ; CHECK: vbsl
   %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
   %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
@@ -376,7 +374,7 @@
 ; full result.
 define void @lower_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
 entry:
-  ; CHECK-LABEL: lower_twice_no_vtrn
+  ; CHECK-LABEL: lower_twice_no_vtrn:
   ; CHECK: @ BB#0:
   ; CHECK-NEXT: vldr d16, [r1]
   ; CHECK-NEXT: vldr d18, [r0]
@@ -395,7 +393,7 @@
 ; full result.
 define void @upper_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
 entry:
-  ; CHECK-LABEL: upper_twice_no_vtrn
+  ; CHECK-LABEL: upper_twice_no_vtrn:
   ; CHECK: @ BB#0:
   ; CHECK-NEXT: vldr d16, [r1]
   ; CHECK-NEXT: vldr d18, [r0]
Index: test/CodeGen/ARM/vzip.ll
===================================================================
--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -316,3 +316,20 @@
   store <4 x i16> %0, <4 x i16>* %B
   ret void
 }
+
+define <8 x i8> @vdup_zip(i8* nocapture readonly %x, i8* nocapture readonly %y) {
+entry:
+  ; CHECK-LABEL: vdup_zip:
+  ; CHECK: vld1.8
+  ; CHECK-NEXT: vld1.8
+  ; CHECK-NEXT: vzip.8
+  ; CHECK-NEXT: vmov r0, r1
+  %0 = load i8, i8* %x, align 1
+  %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+  %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+  %2 = load i8, i8* %y, align 1
+  %3 = insertelement <8 x i8> undef, i8 %2, i32 0
+  %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
+  %vzip.i = shufflevector <8 x i8> %lane, <8 x i8> %lane3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i8> %vzip.i
+}
Index: test/CodeGen/X86/mmx-bitcast.ll
===================================================================
--- test/CodeGen/X86/mmx-bitcast.ll
+++ test/CodeGen/X86/mmx-bitcast.ll
@@ -80,7 +80,8 @@
 ; CHECK-NEXT: movd %esi, %xmm0
 ; CHECK-NEXT: movd %edi, %xmm1
 ; CHECK-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; CHECK-NEXT: movd %xmm1, %rax
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
+; CHECK-NEXT: movd %xmm0, %rax
 ; CHECK-NEXT: retq
   %v0 = insertelement <2 x i32> undef, i32 %a, i32 0
   %v1 = insertelement <2 x i32> %v0, i32 %b, i32 1
Index: test/CodeGen/X86/oddshuffles.ll
===================================================================
--- test/CodeGen/X86/oddshuffles.ll
+++ test/CodeGen/X86/oddshuffles.ll
@@ -233,36 +233,34 @@
 define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
 ; SSE2-LABEL: v7i8:
 ; SSE2: # BB#0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,0,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7]
-; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:
movdqa %xmm2, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al ; SSE2-NEXT: movb %al, 6(%rdi) -; SSE2-NEXT: movd %xmm0, (%rdi) -; SSE2-NEXT: pextrw $4, %xmm2, %eax +; SSE2-NEXT: movd %xmm1, (%rdi) +; SSE2-NEXT: pextrw $4, %xmm0, %eax ; SSE2-NEXT: movw %ax, 4(%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v7i8: ; SSE42: # BB#0: -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[12],zero,xmm0[4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi) -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[8],zero,xmm1[8],zero,xmm1[12,0,u,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; SSE42-NEXT: pextrb $12, %xmm1, 6(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; SSE42-NEXT: pextrw $4, %xmm1, 4(%rdi) @@ -271,13 +269,14 @@ ; ; AVX-LABEL: v7i8: ; AVX: # BB#0: -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[12],zero,xmm0[4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[8],zero,xmm1[8],zero,xmm1[12,0,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi) -; AVX-NEXT: vpextrw $4, %xmm0, 4(%rdi) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15] +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7] +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi) +; AVX-NEXT: vpextrw $4, %xmm1, 4(%rdi) ; AVX-NEXT: vmovd %xmm2, (%rdi) ; AVX-NEXT: retq %r = shufflevector <4 x i8> %a, <4 x i8> %b, <7 x i32> Index: test/CodeGen/X86/promote-vec3.ll =================================================================== --- test/CodeGen/X86/promote-vec3.ll +++ test/CodeGen/X86/promote-vec3.ll @@ -9,17 +9,17 @@ ; SSE3-LABEL: zext_i8: ; SSE3: # BB#0: ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SSE3-NEXT: pxor %xmm0, %xmm0 -; SSE3-NEXT: pxor %xmm1, %xmm1 -; SSE3-NEXT: pinsrw $0, %eax, %xmm1 +; SSE3-NEXT: pinsrw $0, %eax, %xmm0 ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SSE3-NEXT: pinsrw $1, %eax, %xmm1 +; SSE3-NEXT: pinsrw $1, %eax, %xmm0 ; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; SSE3-NEXT: pinsrw $2, %eax, %xmm1 -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: pextrw $2, %xmm1, %edx -; SSE3-NEXT: pextrw $4, %xmm1, %ecx +; SSE3-NEXT: pinsrw $2, %eax, %xmm0 +; SSE3-NEXT: pand {{\.LCPI.*}}, %xmm0 +; SSE3-NEXT: pxor %xmm1, %xmm1 +; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: pextrw $2, %xmm0, %edx +; SSE3-NEXT: pextrw $4, %xmm0, %ecx ; SSE3-NEXT: # kill: %AX %AX %EAX ; SSE3-NEXT: # kill: %DX %DX %EDX ; SSE3-NEXT: # 
kill: %CX %CX %ECX @@ -27,10 +27,10 @@ ; ; SSE41-LABEL: zext_i8: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0 ; SSE41-NEXT: pinsrb $4, {{[0-9]+}}(%esp), %xmm0 ; SSE41-NEXT: pinsrb $8, {{[0-9]+}}(%esp), %xmm0 +; SSE41-NEXT: pand {{\.LCPI.*}}, %xmm0 ; SSE41-NEXT: movd %xmm0, %eax ; SSE41-NEXT: pextrw $2, %xmm0, %edx ; SSE41-NEXT: pextrw $4, %xmm0, %ecx @@ -41,10 +41,10 @@ ; ; AVX-32-LABEL: zext_i8: ; AVX-32: # BB#0: -; AVX-32-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; AVX-32-NEXT: vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-32-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; AVX-32-NEXT: vmovd %xmm0, %eax ; AVX-32-NEXT: vpextrw $2, %xmm0, %edx ; AVX-32-NEXT: vpextrw $4, %xmm0, %ecx Index: test/CodeGen/X86/vec_insert-5.ll =================================================================== --- test/CodeGen/X86/vec_insert-5.ll +++ test/CodeGen/X86/vec_insert-5.ll @@ -11,7 +11,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: shll $12, %ecx ; X32-NEXT: movd %ecx, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1] ; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: retl ; Index: test/CodeGen/X86/vec_insert-mmx.ll =================================================================== --- test/CodeGen/X86/vec_insert-mmx.ll +++ test/CodeGen/X86/vec_insert-mmx.ll @@ -8,7 +8,7 @@ ; X32: ## BB#0: ; X32-NEXT: subl $12, %esp ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,0,1] ; X32-NEXT: movq %xmm0, (%esp) ; X32-NEXT: movq (%esp), %mm0 ; X32-NEXT: addl $12, %esp Index: test/CodeGen/X86/vector-shuffle-128-v16.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v16.ll +++ test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -667,15 +667,27 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { ; SSE-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE: # BB#0: -; SSE-NEXT: movzbl %dil, %eax +; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: movl $255, %eax ; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX: # BB#0: -; AVX-NEXT: movzbl %dil, %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: retq +; AVX1OR2-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vmovd %edi, %xmm0 +; AVX1OR2-NEXT: movl $255, %eax +; AVX1OR2-NEXT: vmovd %eax, %xmm1 +; AVX1OR2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovd %edi, %xmm0 +; AVX512VL-NEXT: movl $255, %eax +; AVX512VL-NEXT: vmovd %eax, %xmm1 +; AVX512VL-NEXT: vpandd %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -684,72 +696,45 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) { ; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE2: # BB#0: -; SSE2-NEXT: shll $8, %edi -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pinsrw $2, %edi, %xmm0 +; SSE2-NEXT: 
movzbl %dil, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSSE3: # BB#0: -; SSSE3-NEXT: shll $8, %edi -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pinsrw $2, %edi, %xmm0 +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pinsrb $5, %edi, %xmm0 +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxord %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $5, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle } define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) { -; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; SSE2: # BB#0: -; SSE2-NEXT: shll $8, %edi -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pinsrw $7, %edi, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; SSSE3: # BB#0: -; SSSE3-NEXT: shll $8, %edi -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pinsrw $7, %edi, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pinsrb $15, %edi, %xmm0 -; SSE41-NEXT: retq -; -; AVX1OR2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq +; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; SSE: # BB#0: +; SSE-NEXT: movd %edi, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; SSE-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxord %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $15, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0] +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 0 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -759,34 
+744,27 @@ ; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE2: # BB#0: ; SSE2-NEXT: movzbl %dil, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pinsrw $1, %eax, %xmm0 +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSSE3: # BB#0: -; SSSE3-NEXT: movzbl %dil, %eax -; SSSE3-NEXT: pxor %xmm0, %xmm0 -; SSSE3-NEXT: pinsrw $1, %eax, %xmm0 +; SSSE3-NEXT: movd %edi, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: ; SSE41: # BB#0: -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pinsrb $2, %edi, %xmm0 +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE41-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxord %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrb $2, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: retq %a = insertelement <16 x i8> undef, i8 %i, i32 3 %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> ret <16 x i8> %shuffle @@ -1731,3 +1709,65 @@ %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> ret <16 x i8> %tmp4 } + +define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) { +; SSE2-LABEL: PR31364: +; SSE2: # BB#0: +; SSE2-NEXT: movzbl (%rdi), %eax +; SSE2-NEXT: movzbl (%rsi), %ecx +; SSE2-NEXT: shll $8, %ecx +; SSE2-NEXT: orl %eax, %ecx +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pinsrw $0, %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,5,4,4,4] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7] +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: PR31364: +; SSSE3: # BB#0: +; SSSE3-NEXT: movzbl (%rdi), %eax +; SSSE3-NEXT: movzbl (%rsi), %ecx +; SSSE3-NEXT: shll $8, %ecx +; SSSE3-NEXT: orl %eax, %ecx +; SSSE3-NEXT: pxor %xmm0, %xmm0 +; SSSE3-NEXT: pinsrw $0, %ecx, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] +; SSSE3-NEXT: retq +; +; SSE41-LABEL: PR31364: +; SSE41: # BB#0: +; SSE41-NEXT: pxor %xmm0, %xmm0 +; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0 +; SSE41-NEXT: pinsrb $1, (%rsi), %xmm0 +; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] +; SSE41-NEXT: retq +; +; 
AVX1OR2-LABEL: PR31364: +; AVX1OR2: # BB#0: +; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 +; AVX1OR2-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 +; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] +; AVX1OR2-NEXT: retq +; +; AVX512VL-LABEL: PR31364: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vpxord %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0 +; AVX512VL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0 +; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0] +; AVX512VL-NEXT: retq + %v0 = load i8, i8* %a, align 1 + %vecins = insertelement <16 x i8> , i8 %v0, i32 0 + %v1 = load i8, i8* %b, align 1 + %vecins2 = insertelement <16 x i8> %vecins, i8 %v1, i32 1 + %result = shufflevector <16 x i8> %vecins2, <16 x i8> undef, <16 x i32> + ret <16 x i8> %result +} Index: test/CodeGen/X86/vector-shuffle-128-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v8.ll +++ test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -1411,21 +1411,17 @@ define <8 x i16> @shuffle_v8i16_z8zzzzzz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_z8zzzzzz: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pinsrw $1, %edi, %xmm0 +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_z8zzzzzz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_z8zzzzzz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxord %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $1, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_z8zzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -1434,21 +1430,17 @@ define <8 x i16> @shuffle_v8i16_zzzzz8zz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zzzzz8zz: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pinsrw $5, %edi, %xmm0 +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zzzzz8zz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zzzzz8zz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxord %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zzzzz8zz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -1457,21 +1449,15 @@ define <8 x i16> @shuffle_v8i16_zuuzuuz8(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zuuzuuz8: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pinsrw $7, %edi, %xmm0 +; SSE-NEXT: movd %edi, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} 
xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zuuzuuz8: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zuuzuuz8: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxord %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $7, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zuuzuuz8: +; AVX: # BB#0: +; AVX-NEXT: vmovd %edi, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 0 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle @@ -1480,21 +1466,17 @@ define <8 x i16> @shuffle_v8i16_zzBzzzzz(i16 %i) { ; SSE-LABEL: shuffle_v8i16_zzBzzzzz: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pinsrw $2, %edi, %xmm0 +; SSE-NEXT: movzwl %di, %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: shuffle_v8i16_zzBzzzzz: -; AVX1OR2: # BB#0: -; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 -; AVX1OR2-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_zzBzzzzz: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpxord %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpinsrw $2, %edi, %xmm0, %xmm0 -; AVX512VL-NEXT: retq +; AVX-LABEL: shuffle_v8i16_zzBzzzzz: +; AVX: # BB#0: +; AVX-NEXT: movzwl %di, %eax +; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; AVX-NEXT: retq %a = insertelement <8 x i16> undef, i16 %i, i32 3 %shuffle = shufflevector <8 x i16> zeroinitializer, <8 x i16> %a, <8 x i32> ret <8 x i16> %shuffle