Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14076,16 +14076,20 @@
 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
-// This combine is done in the following cases:
-// 1. Both N0,N1 are BUILD_VECTOR's composed of constants or undefs.
-// 2. Only one of N0,N1 is a BUILD_VECTOR composed of constants or undefs -
-//    Combine iff that node is ALL_ZEROS. We prefer not to combine a
-//    BUILD_VECTOR of all constants to allow efficient materialization of
-//    constant vectors, but the ALL_ZEROS is an exception because
-//    zero-extension matching seems to rely on having BUILD_VECTOR nodes with
-//    zero padding between elements. FIXME: Eliminate this exception for
-//    ALL_ZEROS constant vectors.
-// 3. Neither N0,N1 are composed of only constants.
+//
+// SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
+// a simplification in some sense, but it isn't appropriate in general: some
+// BUILD_VECTORs are substantially cheaper than others. The general case
+// of a BUILD_VECTOR requires inserting each element individually (or
+// performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
+// all constants is a single constant pool load. A BUILD_VECTOR where each
+// element is identical is a splat. A BUILD_VECTOR where most of the operands
+// are undef lowers to a small number of element insertions.
+//
+// To deal with this, we currently use a bunch of mostly arbitrary heuristics.
+// We don't fold shuffles where one side is a non-zero constant, and we don't
+// fold shuffles if the resulting BUILD_VECTOR would have duplicate
+// non-constant operands. This seems to work out reasonably well in practice.
 static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
                                        SelectionDAG &DAG,
                                        const TargetLowering &TLI) {
@@ -14108,6 +14112,7 @@
   }
 
   SmallVector<SDValue, 8> Ops;
+  SmallSet<SDValue, 16> DuplicateOps;
   for (int M : SVN->getMask()) {
     SDValue Op = DAG.getUNDEF(VT.getScalarType());
     if (M >= 0) {
@@ -14123,6 +14128,12 @@
         return SDValue();
       }
     }
+
+    // Don't duplicate a BUILD_VECTOR operand; semantically, this is fine,
+    // but it's likely to generate bad code.
+    if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
+      if (!DuplicateOps.insert(Op).second)
+        return SDValue();
     Ops.push_back(Op);
   }
 
   // BUILD_VECTOR requires all inputs to be of the same type, find the
Index: test/CodeGen/ARM/vtrn.ll
===================================================================
--- test/CodeGen/ARM/vtrn.ll
+++ test/CodeGen/ARM/vtrn.ll
@@ -343,9 +343,8 @@
 define <8 x i8> @vtrn_mismatched_builvector0(<8 x i8> %tr0, <8 x i8> %tr1,
                                              <4 x i32> %cmp0, <4 x i32> %cmp1,
                                              <4 x i16> %cmp2, <4 x i16> %cmp3) {
-  ; CHECK-LABEL: vtrn_mismatched_builvector0
+  ; CHECK-LABEL: vtrn_mismatched_builvector0:
   ; CHECK: vmovn.i32
-  ; CHECK: vtrn
   ; CHECK: vbsl
   %c0 = icmp ult <4 x i32> %cmp0, %cmp1
   %c1 = icmp ult <4 x i16> %cmp2, %cmp3
@@ -359,10 +358,9 @@
 ; (from the icmp operation).
 define <8 x i8> @vtrn_mismatched_builvector1(<8 x i8> %tr0, <8 x i8> %tr1,
                                              <4 x i32> %cmp0, <4 x i32> %cmp1, <4 x i8> *%cmp2_ptr) {
-  ; CHECK-LABEL: vtrn_mismatched_builvector1
+  ; CHECK-LABEL: vtrn_mismatched_builvector1:
   ; We need to extend the 4 x i8 to 4 x i16 in order to perform the vtrn
   ; CHECK: vmovl
-  ; CHECK: vtrn.8
   ; CHECK: vbsl
   %cmp2_load = load <4 x i8>, <4 x i8> * %cmp2_ptr, align 4
   %cmp2 = trunc <4 x i8> %cmp2_load to <4 x i1>
@@ -376,7 +374,7 @@
 ; full result.
 define void @lower_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
 entry:
-  ; CHECK-LABEL: lower_twice_no_vtrn
+  ; CHECK-LABEL: lower_twice_no_vtrn:
   ; CHECK: @ BB#0:
   ; CHECK-NEXT: vldr d16, [r1]
   ; CHECK-NEXT: vldr d18, [r0]
@@ -395,7 +393,7 @@
 ; full result.
 define void @upper_twice_no_vtrn(<4 x i16>* %A, <4 x i16>* %B, <8 x i16>* %C) {
 entry:
-  ; CHECK-LABEL: upper_twice_no_vtrn
+  ; CHECK-LABEL: upper_twice_no_vtrn:
   ; CHECK: @ BB#0:
   ; CHECK-NEXT: vldr d16, [r1]
   ; CHECK-NEXT: vldr d18, [r0]
Index: test/CodeGen/ARM/vzip.ll
===================================================================
--- test/CodeGen/ARM/vzip.ll
+++ test/CodeGen/ARM/vzip.ll
@@ -317,21 +317,16 @@
   ret void
 }
 
-; FIXME: This should generate a vzip
 define <8 x i8> @vdup_zip(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 entry:
   ; CHECK-LABEL: vdup_zip:
-  ; CHECK: ldrb r0, [r0]
-  ; CHECK-NEXT: ldrb r1, [r1]
-  ; CHECK-NEXT: vmov.8 d16[0], r0
-  ; CHECK-NEXT: vmov.8 d16[1], r1
-  ; CHECK-NEXT: vmov.8 d16[2], r0
-  ; CHECK-NEXT: vmov.8 d16[3], r1
-  ; CHECK-NEXT: vmov.8 d16[4], r0
-  ; CHECK-NEXT: vmov.8 d16[5], r1
-  ; CHECK-NEXT: vmov.8 d16[6], r0
-  ; CHECK-NEXT: vmov.8 d16[7], r1
-  ; CHECK-NEXT: vmov r0, r1, d16
+  ; CHECK: ldrb
+  ; CHECK-NEXT: ldrb
+  ; CHECK-NEXT: vmov.i16 d{{.*}}, #0x800
+  ; CHECK-NEXT: vmov.8
+  ; CHECK-NEXT: vmov.8
+  ; CHECK-NEXT: vtbl.8
+  ; CHECK-NEXT: vmov r0, r1
   %0 = load i8, i8* %x, align 1
   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
   %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32>
Index: test/CodeGen/X86/oddshuffles.ll
===================================================================
--- test/CodeGen/X86/oddshuffles.ll
+++ test/CodeGen/X86/oddshuffles.ll
@@ -233,36 +233,34 @@
 define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
 ; SSE2-LABEL: v7i8:
 ; SSE2: # BB#0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,1,3,4,5,6,7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,0,255,0,255,255,255,255,255,255,255,255,255,255,255]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,5,7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,0,2,1,4,5,6,7]
-; SSE2-NEXT: pand %xmm2, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,0,65535,0,65535,65535,65535]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,2,0,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,4,7]
+; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: por %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
+; SSE2-NEXT: pand %xmm2, %xmm0
+; SSE2-NEXT: packuswb %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT: movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT: movb %al, 6(%rdi)
-; SSE2-NEXT: movd %xmm0, (%rdi)
-; SSE2-NEXT: pextrw $4, %xmm2, %eax
+; SSE2-NEXT: movd %xmm1, (%rdi)
+; SSE2-NEXT: pextrw $4, %xmm0, %eax
 ; SSE2-NEXT: movw %ax, 4(%rdi)
 ; SSE2-NEXT: retq
 ;
 ; SSE42-LABEL: v7i8:
 ; SSE42: # BB#0:
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[12],zero,xmm0[4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: pextrb $0, %xmm1, 6(%rdi)
-; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[8],zero,xmm1[8],zero,xmm1[12,0,u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: por %xmm0, %xmm1
+; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
+; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
+; SSE42-NEXT: pextrb $12, %xmm1, 6(%rdi)
+; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; SSE42-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
 ; SSE42-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
 ; SSE42-NEXT: pextrw $4, %xmm1, 4(%rdi)
@@ -271,13 +269,14 @@
 ;
 ; AVX-LABEL: v7i8:
 ; AVX: # BB#0:
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[12],zero,xmm0[4],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[8],zero,xmm1[8],zero,xmm1[12,0,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm0, %xmm2, %xmm0
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: vpextrb $0, %xmm1, 6(%rdi)
-; AVX-NEXT: vpextrw $4, %xmm0, 4(%rdi)
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
+; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT: vpextrb $12, %xmm0, 6(%rdi)
+; AVX-NEXT: vpextrw $4, %xmm1, 4(%rdi)
 ; AVX-NEXT: vmovd %xmm2, (%rdi)
 ; AVX-NEXT: retq
   %r = shufflevector <4 x i8> %a, <4 x i8> %b, <7 x i32>
Index: test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1735,93 +1735,57 @@
 define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b) {
 ; SSE2-LABEL: PR31364:
 ; SSE2: # BB#0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movd %eax, %xmm1
 ; SSE2-NEXT: movzbl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movzbl (%rsi), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: movzbl (%rsi), %ecx
+; SSE2-NEXT: shll $8, %ecx
+; SSE2-NEXT: orl %eax, %ecx
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pinsrw $0, %ecx, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,5,4,4,4]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7]
+; SSE2-NEXT: packuswb %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: PR31364:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: xorl %eax, %eax
-; SSSE3-NEXT: movd %eax, %xmm1
 ; SSSE3-NEXT: movzbl (%rdi), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: movzbl (%rsi), %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: movzbl (%rsi), %ecx
+; SSSE3-NEXT: shll $8, %ecx
+; SSSE3-NEXT: orl %eax, %ecx
+; SSSE3-NEXT: pxor %xmm0, %xmm0
+; SSSE3-NEXT: pinsrw $0, %ecx, %xmm0
+; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: PR31364:
 ; SSE41: # BB#0:
-; SSE41-NEXT: movzbl (%rsi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pinsrb $1, %eax, %xmm0
-; SSE41-NEXT: pinsrb $2, %eax, %xmm0
-; SSE41-NEXT: pinsrb $3, %eax, %xmm0
-; SSE41-NEXT: pinsrb $4, %eax, %xmm0
-; SSE41-NEXT: pinsrb $5, %eax, %xmm0
-; SSE41-NEXT: pinsrb $6, %eax, %xmm0
-; SSE41-NEXT: xorl %ecx, %ecx
-; SSE41-NEXT: pinsrb $7, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $8, %eax, %xmm0
-; SSE41-NEXT: pinsrb $9, %eax, %xmm0
-; SSE41-NEXT: pinsrb $10, %eax, %xmm0
-; SSE41-NEXT: pinsrb $11, %eax, %xmm0
-; SSE41-NEXT: pinsrb $12, %eax, %xmm0
-; SSE41-NEXT: movzbl (%rdi), %eax
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: pinsrb $14, %eax, %xmm0
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: pxor %xmm0, %xmm0
+; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0
+; SSE41-NEXT: pinsrb $1, (%rsi), %xmm0
+; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: PR31364:
-; AVX: # BB#0:
-; AVX-NEXT: movzbl (%rsi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $2, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $4, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $6, %eax, %xmm0, %xmm0
-; AVX-NEXT: xorl %ecx, %ecx
-; AVX-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
-; AVX-NEXT: movzbl (%rdi), %eax
-; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1OR2-LABEL: PR31364:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1OR2-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0
+; AVX1OR2-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0
+; AVX1OR2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: PR31364:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpxord %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0
+; AVX512VL-NEXT: vpinsrb $1, (%rsi), %xmm0, %xmm0
+; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
+; AVX512VL-NEXT: retq
   %v0 = load i8, i8* %a, align 1
   %vecins = insertelement <16 x i8> , i8 %v0, i32 0
   %v1 = load i8, i8* %b, align 1
@@ -1833,74 +1797,63 @@
 define <16 x i8> @PR31301(i8* nocapture readonly %x, i8* nocapture readonly %y) {
 ; SSE2-LABEL: PR31301:
 ; SSE2: # BB#0: # %entry
+; SSE2-NEXT: movzbl (%rdi), %eax
+; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: movzbl (%rsi), %eax
 ; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE2-NEXT: pand %xmm0, %xmm2
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: movzbl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT: pandn %xmm1, %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: PR31301:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movzbl (%rsi), %eax
-; SSSE3-NEXT: movd %eax, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
 ; SSSE3-NEXT: movzbl (%rdi), %eax
 ; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
+; SSSE3-NEXT: movzbl (%rsi), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: PR31301:
 ; SSE41: # BB#0: # %entry
+; SSE41-NEXT: movzbl (%rdi), %eax
+; SSE41-NEXT: movd %eax, %xmm0
 ; SSE41-NEXT: movzbl (%rsi), %eax
-; SSE41-NEXT: movzbl (%rdi), %ecx
-; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: pinsrb $1, %eax, %xmm0
-; SSE41-NEXT: pinsrb $2, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $3, %eax, %xmm0
-; SSE41-NEXT: pinsrb $4, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $5, %eax, %xmm0
-; SSE41-NEXT: pinsrb $6, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $7, %eax, %xmm0
-; SSE41-NEXT: pinsrb $8, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $9, %eax, %xmm0
-; SSE41-NEXT: pinsrb $10, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $11, %eax, %xmm0
-; SSE41-NEXT: pinsrb $12, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $13, %eax, %xmm0
-; SSE41-NEXT: pinsrb $14, %ecx, %xmm0
-; SSE41-NEXT: pinsrb $15, %eax, %xmm0
+; SSE41-NEXT: movd %eax, %xmm1
+; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; SSE41-NEXT: retq
 ;
-; AVX-LABEL: PR31301:
-; AVX: # BB#0: # %entry
-; AVX-NEXT: movzbl (%rsi), %eax
-; AVX-NEXT: movzbl (%rdi), %ecx
-; AVX-NEXT: vmovd %ecx, %xmm0
-; AVX-NEXT: vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
-; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
+; AVX1-LABEL: PR31301:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: movzbl (%rdi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm0
+; AVX1-NEXT: movzbl (%rsi), %eax
+; AVX1-NEXT: vmovd %eax, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; AVX1-NEXT: retq
+;
+; AVX2OR512VL-LABEL: PR31301:
+; AVX2OR512VL: # BB#0: # %entry
+; AVX2OR512VL-NEXT: movzbl (%rdi), %eax
+; AVX2OR512VL-NEXT: vmovd %eax, %xmm0
+; AVX2OR512VL-NEXT: movzbl (%rsi), %eax
+; AVX2OR512VL-NEXT: vmovd %eax, %xmm1
+; AVX2OR512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2OR512VL-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2OR512VL-NEXT: retq
 entry:
   %0 = load i8, i8* %x, align 1
   %1 = insertelement <16 x i8> undef, i8 %0, i32 0