Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7825,24 +7825,19 @@
   }
 
   // Next, we iteratively mix elements, e.g. for v4f32:
-  //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
-  //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
-  //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
-  unsigned EltStride = NumElems >> 1;
-  while (EltStride != 0) {
-    for (unsigned i = 0; i < EltStride; ++i) {
-      // If Ops[i+EltStride] is undef and this is the first round of mixing,
-      // then it is safe to just drop this shuffle: V[i] is already in the
-      // right place, the one element (since it's the first round) being
-      // inserted as undef can be dropped.  This isn't safe for successive
-      // rounds because they will permute elements within both vectors.
-      if (Ops[i+EltStride].isUndef() &&
-          EltStride == NumElems/2)
-        continue;
+  //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
+  //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
+  //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
+  for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
+    SmallVector<int, 16> Mask;
+    for(unsigned i = 0; i != Scale; ++i)
+      Mask.push_back(i);
+    for (unsigned i = 0; i != Scale; ++i)
+      Mask.push_back(NumElems+i);
+    Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
 
-      Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
-    }
-    EltStride >>= 1;
+    for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
+      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
   }
   return Ops[0];
 }
Index: test/CodeGen/X86/build-vector-128.ll =================================================================== --- test/CodeGen/X86/build-vector-128.ll +++ test/CodeGen/X86/build-vector-128.ll @@ -41,9 +41,9 @@ } ; ; SSE2-64-LABEL: test_buildvector_v4f32: ; SSE2-64: # BB#0: -; SSE2-64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-64-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-64-NEXT: retq ; ; SSE41-64-LABEL: test_buildvector_v4f32: @@ -74,13 +74,9 @@ define <2 x i64> @test_buildvector_v2i64(i64 %a0, i64 %a1) { ; SSE2-32-LABEL: test_buildvector_v2i64: ; SSE2-32: # BB#0: -; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-32-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-32-NEXT: retl ; ; SSE-64-LABEL: test_buildvector_v2i64: @@ -126,12 +122,12 @@ ; SSE2-64-LABEL: test_buildvector_v4i32: ; SSE2-64: # BB#0: ; SSE2-64-NEXT: movd %ecx, %xmm0 -; SSE2-64-NEXT: movd %esi, %xmm1 +; SSE2-64-NEXT: movd %edx, %xmm1 ; SSE2-64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-64-NEXT: movd %edx, %xmm2 +; SSE2-64-NEXT: movd %esi, %xmm2 ; SSE2-64-NEXT: movd %edi, %xmm0 ; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; 
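A note on the new loop above, for readers tracing the lowering: each round merges adjacent pairs Ops[2*i] and Ops[2*i+1] with a mask that takes the low Scale elements of each source, so the effective element width doubles every round and shuffle lowering can pick the matching unpack instruction. A standalone sketch (plain C++, not LLVM code; -1 stands in for SM_SentinelUndef) that prints the masks the loop would generate for an 8-element build:

  // Sketch: model the scaled unpack masks fed to DAG.getVectorShuffle().
  #include <cstdio>
  #include <vector>

  int main() {
    const unsigned NumElems = 8; // mirrors a v8i16 BUILD_VECTOR
    for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
      std::vector<int> Mask;
      for (unsigned i = 0; i != Scale; ++i)
        Mask.push_back(i);            // low Scale elements of source 1
      for (unsigned i = 0; i != Scale; ++i)
        Mask.push_back(NumElems + i); // low Scale elements of source 2
      Mask.resize(NumElems, -1);      // remaining lanes are undef
      std::printf("Scale %u (%u merges):", Scale, NumElems / (2 * Scale));
      for (int M : Mask)
        std::printf(" %d", M);
      std::printf("\n");
    }
  }

For NumElems = 8 this prints the masks <0,8,u,...>, then <0,1,8,9,u,...>, then <0,1,2,3,8,9,10,11> — exactly the punpcklwd -> punpckldq -> punpcklqdq progression the updated checks below expect. It also suggests why the hand-rolled first-round undef special case could be dropped: getVectorShuffle already canonicalizes shuffles whose second operand is undef.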
SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-64-NEXT: retq ; ; SSE41-64-LABEL: test_buildvector_v4i32: @@ -170,34 +166,34 @@ ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-32-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-32-NEXT: retl ; ; SSE2-64-LABEL: test_buildvector_v8i16: ; SSE2-64: # BB#0: -; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-64-NEXT: movd %r9d, %xmm1 -; SSE2-64-NEXT: movd %esi, %xmm2 -; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-64-NEXT: movd %r9d, %xmm0 +; SSE2-64-NEXT: movd %r8d, %xmm2 ; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-64-NEXT: movd %ecx, %xmm0 ; SSE2-64-NEXT: movd %edx, %xmm1 -; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-64-NEXT: movd %r8d, %xmm3 +; SSE2-64-NEXT: movd %esi, %xmm3 ; SSE2-64-NEXT: movd %edi, %xmm0 ; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-64-NEXT: retq ; ; SSE41-32-LABEL: test_buildvector_v8i16: @@ -267,31 +263,31 @@ ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-32-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-32-NEXT: retl ; ; 
SSE2-64-LABEL: test_buildvector_v16i8: @@ -299,34 +295,34 @@ ; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-64-NEXT: movd %ecx, %xmm0 -; SSE2-64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-64-NEXT: movd %r9d, %xmm1 +; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-64-NEXT: movd %esi, %xmm2 -; SSE2-64-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-64-NEXT: movd %edx, %xmm3 ; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-64-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-64-NEXT: movd %r8d, %xmm1 +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-64-NEXT: movd %r9d, %xmm0 +; SSE2-64-NEXT: movd %r8d, %xmm2 +; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-64-NEXT: movd %ecx, %xmm0 +; SSE2-64-NEXT: movd %edx, %xmm1 ; SSE2-64-NEXT: punpcklbw 
{{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-64-NEXT: movd %esi, %xmm4 ; SSE2-64-NEXT: movd %edi, %xmm0 -; SSE2-64-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-64-NEXT: retq ; ; SSE41-32-LABEL: test_buildvector_v16i8: Index: test/CodeGen/X86/buildvec-insertvec.ll =================================================================== --- test/CodeGen/X86/buildvec-insertvec.ll +++ test/CodeGen/X86/buildvec-insertvec.ll @@ -75,9 +75,9 @@ define <4 x float> @test_buildvector_v4f32_register(float %f0, float %f1, float %f2, float %f3) { ; SSE2-LABEL: test_buildvector_v4f32_register: ; SSE2: # BB#0: -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v4f32_register: @@ -102,7 +102,7 @@ ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v4f32_load: @@ -126,10 +126,10 @@ define <4 x float> @test_buildvector_v4f32_partial_load(float %f0, float %f1, float %f2, float* %p3) { ; SSE2-LABEL: test_buildvector_v4f32_partial_load: ; SSE2: # BB#0: -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v4f32_partial_load: @@ -150,12 +150,12 @@ ; SSE2-LABEL: test_buildvector_v4i32_register: ; SSE2: # BB#0: ; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movd %esi, %xmm1 +; SSE2-NEXT: movd %edx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: movd %esi, %xmm2 ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v4i32_register: @@ -178,7 +178,7 @@ ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: movd %esi, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v4i32_partial: @@ -228,21 +228,21 @@ define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) { ; SSE2-LABEL: test_buildvector_v8i16_register: ; SSE2: # BB#0: -; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movd %r9d, %xmm1 -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: movd %esi, %xmm3 ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v8i16_register: @@ -333,34 +333,34 @@ ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movd %r9d, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movd %edx, %xmm3 ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: movd %r8d, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %r9d, %xmm0 +; SSE2-NEXT: movd %r8d, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: movd %edx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %esi, %xmm4 ; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_buildvector_v16i8_register: Index: test/CodeGen/X86/clear_upper_vector_element_bits.ll 
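Summarizing the codegen shape change before the remaining test updates: every delta above encodes the same tree — scalars are still merged pairwise, but each round now uses the next wider interleave, so a v8i16 build lowers to 4x punpcklwd + 2x punpckldq + 1x punpcklqdq instead of 7x punpcklwd. A minimal standalone equivalent using SSE2 intrinsics (build_v8i16 is a hypothetical illustration, not part of the patch):

  // Build <8 x i16> from scalars with the widening unpack pattern.
  #include <emmintrin.h>

  __m128i build_v8i16(short a0, short a1, short a2, short a3,
                      short a4, short a5, short a6, short a7) {
    // Round 1 (Scale = 1): four punpcklwd merges of single elements.
    __m128i v01 = _mm_unpacklo_epi16(_mm_cvtsi32_si128((unsigned short)a0),
                                     _mm_cvtsi32_si128((unsigned short)a1));
    __m128i v23 = _mm_unpacklo_epi16(_mm_cvtsi32_si128((unsigned short)a2),
                                     _mm_cvtsi32_si128((unsigned short)a3));
    __m128i v45 = _mm_unpacklo_epi16(_mm_cvtsi32_si128((unsigned short)a4),
                                     _mm_cvtsi32_si128((unsigned short)a5));
    __m128i v67 = _mm_unpacklo_epi16(_mm_cvtsi32_si128((unsigned short)a6),
                                     _mm_cvtsi32_si128((unsigned short)a7));
    // Round 2 (Scale = 2): two punpckldq merges of i16 pairs.
    __m128i v0123 = _mm_unpacklo_epi32(v01, v23);
    __m128i v4567 = _mm_unpacklo_epi32(v45, v67);
    // Round 3 (Scale = 4): one punpcklqdq merge of i16 quads.
    return _mm_unpacklo_epi64(v0123, v4567);
  }

Adjacent pairing also keeps element 0 in the register it arrives in, which is where the dropped pextrw/push/pop traffic in _clearupper16xi16a below comes from.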
=================================================================== --- test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -159,27 +159,18 @@ define <8 x i16> @_clearupper8xi16a(<8 x i16>) nounwind { ; SSE-LABEL: _clearupper8xi16a: ; SSE: # BB#0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: pextrw $2, %xmm0, %r9d -; SSE-NEXT: pextrw $3, %xmm0, %edx -; SSE-NEXT: pextrw $4, %xmm0, %r8d -; SSE-NEXT: pextrw $5, %xmm0, %edi -; SSE-NEXT: pextrw $6, %xmm0, %esi -; SSE-NEXT: pextrw $7, %xmm0, %ecx -; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: pextrw $4, %xmm0, %eax +; SSE-NEXT: pextrw $5, %xmm0, %ecx +; SSE-NEXT: pextrw $6, %xmm0, %edx +; SSE-NEXT: pextrw $7, %xmm0, %esi +; SSE-NEXT: movd %esi, %xmm1 ; SSE-NEXT: movd %edx, %xmm2 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movd %edi, %xmm1 +; SSE-NEXT: movd %ecx, %xmm1 ; SSE-NEXT: movd %eax, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movd %esi, %xmm1 -; SSE-NEXT: movd %r9d, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: movd %r8d, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; @@ -225,61 +216,33 @@ define <16 x i16> @_clearupper16xi16a(<16 x i16>) nounwind { ; SSE-LABEL: _clearupper16xi16a: ; SSE: # BB#0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r15 -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %r12 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: pextrw $1, %xmm0, %edi -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: pextrw $3, %xmm0, %ecx -; SSE-NEXT: pextrw $4, %xmm0, %edx -; SSE-NEXT: pextrw $5, %xmm0, %esi -; SSE-NEXT: pextrw $6, %xmm0, %ebx -; SSE-NEXT: pextrw $7, %xmm0, %ebp -; SSE-NEXT: pextrw $1, %xmm1, %r10d -; SSE-NEXT: pextrw $2, %xmm1, %r9d -; SSE-NEXT: pextrw $3, %xmm1, %r14d +; SSE-NEXT: pextrw $4, %xmm0, %eax +; SSE-NEXT: pextrw $5, %xmm0, %ecx +; SSE-NEXT: pextrw $6, %xmm0, %edx +; SSE-NEXT: pextrw $7, %xmm0, %esi ; SSE-NEXT: pextrw $4, %xmm1, %r8d -; SSE-NEXT: pextrw $5, %xmm1, %r15d -; SSE-NEXT: pextrw $6, %xmm1, %r11d -; SSE-NEXT: pextrw $7, %xmm1, %r12d -; SSE-NEXT: movd %ebp, %xmm2 -; SSE-NEXT: movd %ecx, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: pextrw $5, %xmm1, %r9d +; SSE-NEXT: pextrw $6, %xmm1, %r10d +; SSE-NEXT: pextrw $7, %xmm1, %edi ; SSE-NEXT: movd %esi, %xmm2 -; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movd %ebx, %xmm2 -; SSE-NEXT: movd %eax, %xmm3 +; SSE-NEXT: movd %edx, %xmm3 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; SSE-NEXT: movd %edx, %xmm2 -; SSE-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE-NEXT: movd %ecx, %xmm2 +; SSE-NEXT: movd %eax, %xmm4 +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movd %r12d, %xmm3 -; SSE-NEXT: movd %r14d, %xmm4 +; SSE-NEXT: movd %edi, %xmm3 +; SSE-NEXT: movd %r10d, %xmm4 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movd %r15d, %xmm3 -; SSE-NEXT: movd %r10d, %xmm5 +; SSE-NEXT: movd %r9d, %xmm3 +; SSE-NEXT: movd %r8d, %xmm5 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE-NEXT: movd %r11d, %xmm3 -; SSE-NEXT: movd %r9d, %xmm4 -; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE-NEXT: movd %r8d, %xmm3 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r12 -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %r15 -; SSE-NEXT: popq %rbp ; SSE-NEXT: retq ; ; AVX-LABEL: _clearupper16xi16a: @@ -364,10 +327,9 @@ ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax @@ -375,31 +337,32 @@ ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: movd %eax, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; 
SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; @@ -486,10 +449,9 @@ ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax @@ -497,31 +459,32 @@ ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: movd %eax, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax @@ -531,10 +494,9 @@ ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: movd %eax, %xmm4 +; SSE-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax @@ -542,31 +504,32 @@ ; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: movd %eax, %xmm3 +; SSE-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: movd %eax, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE-NEXT: movd %eax, %xmm1 +; SSE-NEXT: movd {{.*#+}} xmm5 = mem[0],zero,zero,zero +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE-NEXT: movd %eax, %xmm1 ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] ; SSE-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE-NEXT: movd %eax, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: retq ; Index: test/CodeGen/X86/haddsub-2.ll =================================================================== --- test/CodeGen/X86/haddsub-2.ll +++ test/CodeGen/X86/haddsub-2.ll @@ -142,12 +142,12 @@ ; SSE3-NEXT: movd %xmm0, %edi ; 
SSE3-NEXT: addl %eax, %edi ; SSE3-NEXT: movd %edi, %xmm0 -; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: movd %esi, %xmm2 +; SSE3-NEXT: movd %edx, %xmm2 ; SSE3-NEXT: movd %ecx, %xmm0 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: phadd_d_test1: @@ -196,16 +196,16 @@ ; SSE3-NEXT: movd %xmm0, %esi ; SSE3-NEXT: addl %eax, %esi ; SSE3-NEXT: movd %esi, %xmm0 +; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] +; SSE3-NEXT: movd %xmm2, %eax +; SSE3-NEXT: movd %xmm1, %esi +; SSE3-NEXT: addl %eax, %esi +; SSE3-NEXT: movd %esi, %xmm1 +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE3-NEXT: movd %ecx, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: movd %xmm1, %ecx -; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movd %ecx, %xmm1 ; SSE3-NEXT: movd %edx, %xmm0 -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: phadd_d_test2: @@ -258,12 +258,12 @@ ; SSE3-NEXT: movd %xmm0, %edi ; SSE3-NEXT: subl %edi, %esi ; SSE3-NEXT: movd %esi, %xmm0 -; SSE3-NEXT: movd %ecx, %xmm1 +; SSE3-NEXT: movd %edx, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: movd %edx, %xmm2 +; SSE3-NEXT: movd %ecx, %xmm2 ; SSE3-NEXT: movd %eax, %xmm0 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: phsub_d_test1: @@ -312,16 +312,16 @@ ; SSE3-NEXT: movd %xmm0, %esi ; SSE3-NEXT: subl %esi, %edx ; SSE3-NEXT: movd %edx, %xmm0 +; SSE3-NEXT: movd %xmm1, %edx +; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] +; SSE3-NEXT: movd %xmm1, %esi +; SSE3-NEXT: subl %esi, %edx +; SSE3-NEXT: movd %edx, %xmm1 +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE3-NEXT: movd %xmm0, %edx -; SSE3-NEXT: subl %edx, %eax -; SSE3-NEXT: movd %eax, %xmm1 ; SSE3-NEXT: movd %ecx, %xmm0 -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: phsub_d_test2: @@ -518,19 +518,19 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-NEXT: movd %xmm0, %r9d ; SSE3-NEXT: addl %edx, %r9d -; SSE3-NEXT: movd %xmm1, %esi +; SSE3-NEXT: movd %xmm1, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE3-NEXT: movd %xmm0, %r10d -; SSE3-NEXT: addl %esi, %r10d -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE3-NEXT: movd %xmm0, %esi +; SSE3-NEXT: addl %edx, %esi +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] ; SSE3-NEXT: movd %xmm0, %edi -; SSE3-NEXT: addl %esi, %edi 
+; SSE3-NEXT: addl %edx, %edi ; SSE3-NEXT: movd %xmm2, %eax ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE3-NEXT: movd %xmm0, %r11d -; SSE3-NEXT: addl %eax, %r11d +; SSE3-NEXT: movd %xmm0, %r10d +; SSE3-NEXT: addl %eax, %r10d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] @@ -541,24 +541,24 @@ ; SSE3-NEXT: movd %xmm0, %edx ; SSE3-NEXT: addl %eax, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: movd %xmm0, %r11d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] -; SSE3-NEXT: movd %xmm0, %esi -; SSE3-NEXT: addl %eax, %esi +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: addl %r11d, %eax ; SSE3-NEXT: movd %edi, %xmm0 -; SSE3-NEXT: movd %r9d, %xmm1 +; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: movd %r10d, %xmm2 +; SSE3-NEXT: movd %r9d, %xmm2 ; SSE3-NEXT: movd %r8d, %xmm0 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE3-NEXT: movd %esi, %xmm1 -; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movd %edx, %xmm2 ; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE3-NEXT: movd %edx, %xmm3 -; SSE3-NEXT: movd %r11d, %xmm1 +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: movd %r10d, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: avx2_vphadd_d_test: @@ -658,83 +658,83 @@ ; SSE3-NEXT: addl %eax, %ecx ; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill ; SSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE3-NEXT: pextrw $3, %xmm0, %r11d -; SSE3-NEXT: addl %eax, %r11d +; SSE3-NEXT: pextrw $3, %xmm0, %ecx +; SSE3-NEXT: addl %eax, %ecx +; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill ; SSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE3-NEXT: pextrw $5, %xmm0, %r10d -; SSE3-NEXT: addl %eax, %r10d +; SSE3-NEXT: pextrw $5, %xmm0, %r11d +; SSE3-NEXT: addl %eax, %r11d ; SSE3-NEXT: pextrw $6, %xmm0, %eax -; SSE3-NEXT: pextrw $7, %xmm0, %r13d -; SSE3-NEXT: addl %eax, %r13d +; SSE3-NEXT: pextrw $7, %xmm0, %r15d +; SSE3-NEXT: addl %eax, %r15d ; SSE3-NEXT: movd %xmm1, %eax -; SSE3-NEXT: pextrw $1, %xmm1, %r14d -; SSE3-NEXT: addl %eax, %r14d +; SSE3-NEXT: pextrw $1, %xmm1, %r13d +; SSE3-NEXT: addl %eax, %r13d ; SSE3-NEXT: pextrw $2, %xmm1, %eax -; SSE3-NEXT: pextrw $3, %xmm1, %ebp -; SSE3-NEXT: addl %eax, %ebp -; SSE3-NEXT: pextrw $4, %xmm1, %eax -; SSE3-NEXT: pextrw $5, %xmm1, %ebx +; SSE3-NEXT: pextrw $3, %xmm1, %ebx ; SSE3-NEXT: addl %eax, %ebx +; SSE3-NEXT: pextrw $4, %xmm1, %eax +; SSE3-NEXT: pextrw $5, %xmm1, %r8d +; SSE3-NEXT: addl %eax, %r8d ; SSE3-NEXT: pextrw $6, %xmm1, %eax -; SSE3-NEXT: pextrw $7, %xmm1, %edx -; SSE3-NEXT: addl %eax, %edx +; SSE3-NEXT: pextrw $7, %xmm1, %esi +; SSE3-NEXT: addl %eax, %esi ; SSE3-NEXT: movd %xmm2, %eax -; SSE3-NEXT: pextrw $1, %xmm2, %ecx -; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; SSE3-NEXT: pextrw $1, %xmm2, %r10d +; SSE3-NEXT: addl %eax, %r10d ; SSE3-NEXT: pextrw $2, %xmm2, %eax -; SSE3-NEXT: pextrw $3, %xmm2, %r12d -; SSE3-NEXT: addl %eax, %r12d +; SSE3-NEXT: pextrw $3, %xmm2, %r14d +; SSE3-NEXT: addl %eax, %r14d ; SSE3-NEXT: pextrw 
$4, %xmm2, %eax -; SSE3-NEXT: pextrw $5, %xmm2, %r15d -; SSE3-NEXT: addl %eax, %r15d +; SSE3-NEXT: pextrw $5, %xmm2, %r12d +; SSE3-NEXT: addl %eax, %r12d ; SSE3-NEXT: pextrw $6, %xmm2, %eax -; SSE3-NEXT: pextrw $7, %xmm2, %r8d -; SSE3-NEXT: addl %eax, %r8d -; SSE3-NEXT: movd %xmm3, %eax -; SSE3-NEXT: pextrw $1, %xmm3, %r9d +; SSE3-NEXT: pextrw $7, %xmm2, %r9d ; SSE3-NEXT: addl %eax, %r9d -; SSE3-NEXT: pextrw $2, %xmm3, %eax -; SSE3-NEXT: pextrw $3, %xmm3, %esi -; SSE3-NEXT: addl %eax, %esi -; SSE3-NEXT: pextrw $4, %xmm3, %eax -; SSE3-NEXT: pextrw $5, %xmm3, %edi -; SSE3-NEXT: addl %eax, %edi -; SSE3-NEXT: pextrw $6, %xmm3, %ecx +; SSE3-NEXT: movd %xmm3, %eax +; SSE3-NEXT: pextrw $1, %xmm3, %ebp +; SSE3-NEXT: addl %eax, %ebp +; SSE3-NEXT: pextrw $2, %xmm3, %edx +; SSE3-NEXT: pextrw $3, %xmm3, %edi +; SSE3-NEXT: addl %edx, %edi +; SSE3-NEXT: pextrw $4, %xmm3, %edx +; SSE3-NEXT: pextrw $5, %xmm3, %ecx +; SSE3-NEXT: addl %edx, %ecx +; SSE3-NEXT: pextrw $6, %xmm3, %edx ; SSE3-NEXT: pextrw $7, %xmm3, %eax -; SSE3-NEXT: addl %ecx, %eax -; SSE3-NEXT: movd %edx, %xmm8 -; SSE3-NEXT: movd %r13d, %xmm3 -; SSE3-NEXT: movd %ebp, %xmm9 -; SSE3-NEXT: movd %r11d, %xmm4 -; SSE3-NEXT: movd %ebx, %xmm10 -; SSE3-NEXT: movd %r10d, %xmm7 -; SSE3-NEXT: movd %r14d, %xmm11 +; SSE3-NEXT: addl %edx, %eax +; SSE3-NEXT: movd %esi, %xmm8 +; SSE3-NEXT: movd %r8d, %xmm3 +; SSE3-NEXT: movd %ebx, %xmm9 +; SSE3-NEXT: movd %r13d, %xmm4 +; SSE3-NEXT: movd %r15d, %xmm10 +; SSE3-NEXT: movd %r11d, %xmm7 +; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm11 # 4-byte Folded Reload +; SSE3-NEXT: # xmm11 = mem[0],zero,zero,zero ; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload ; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero ; SSE3-NEXT: movd %eax, %xmm12 -; SSE3-NEXT: movd %r8d, %xmm6 -; SSE3-NEXT: movd %esi, %xmm13 -; SSE3-NEXT: movd %r12d, %xmm5 -; SSE3-NEXT: movd %edi, %xmm14 -; SSE3-NEXT: movd %r15d, %xmm2 -; SSE3-NEXT: movd %r9d, %xmm15 -; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload -; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero +; SSE3-NEXT: movd %ecx, %xmm6 +; SSE3-NEXT: movd %edi, %xmm13 +; SSE3-NEXT: movd %ebp, %xmm5 +; SSE3-NEXT: movd %r9d, %xmm14 +; SSE3-NEXT: movd %r12d, %xmm2 +; SSE3-NEXT: movd %r14d, %xmm15 +; SSE3-NEXT: movd %r10d, %xmm1 ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE3-NEXT: punpckldq {{.*#+}} 
xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE3-NEXT: popq %rbx ; SSE3-NEXT: popq %r12 ; SSE3-NEXT: popq %r13 @@ -858,12 +858,12 @@ ; SSE-NEXT: movd %xmm0, %edi ; SSE-NEXT: subl %edi, %esi ; SSE-NEXT: movd %esi, %xmm0 -; SSE-NEXT: movd %ecx, %xmm1 +; SSE-NEXT: movd %edx, %xmm1 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movd %edx, %xmm2 +; SSE-NEXT: movd %ecx, %xmm2 ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: not_a_hsub_1: @@ -919,11 +919,11 @@ ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] ; SSE-NEXT: subss %xmm4, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE-NEXT: subss %xmm3, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE-NEXT: subss %xmm4, %xmm1 +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX-LABEL: not_a_hsub_2: @@ -1162,19 +1162,19 @@ ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE3-NEXT: movd %xmm0, %r9d ; SSE3-NEXT: addl %edx, %r9d -; SSE3-NEXT: movd %xmm2, %esi +; SSE3-NEXT: movd %xmm2, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE3-NEXT: movd %xmm0, %r10d -; SSE3-NEXT: addl %esi, %r10d -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE3-NEXT: movd %xmm0, %esi +; SSE3-NEXT: addl %edx, %esi +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE3-NEXT: movd %xmm0, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3] ; SSE3-NEXT: movd %xmm0, %edi -; SSE3-NEXT: addl %esi, %edi +; SSE3-NEXT: addl %edx, %edi ; SSE3-NEXT: movd %xmm1, %eax ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE3-NEXT: movd %xmm0, %r11d -; SSE3-NEXT: addl %eax, %r11d +; SSE3-NEXT: movd %xmm0, %r10d +; SSE3-NEXT: addl %eax, %r10d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3] @@ -1185,24 +1185,24 @@ ; SSE3-NEXT: movd %xmm0, %edx ; SSE3-NEXT: addl %eax, %edx ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: movd %xmm0, %r11d ; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3] -; SSE3-NEXT: movd %xmm0, %esi -; SSE3-NEXT: addl %eax, %esi +; SSE3-NEXT: movd %xmm0, %eax +; SSE3-NEXT: addl %r11d, %eax ; SSE3-NEXT: movd %edi, %xmm0 -; SSE3-NEXT: movd %r9d, %xmm1 +; SSE3-NEXT: movd %esi, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE3-NEXT: movd %r10d, %xmm2 +; SSE3-NEXT: movd %r9d, %xmm2 ; SSE3-NEXT: movd %r8d, %xmm0 ; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE3-NEXT: movd %esi, %xmm1 -; SSE3-NEXT: movd %ecx, %xmm2 +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE3-NEXT: movd %eax, %xmm1 +; SSE3-NEXT: movd %edx, %xmm2 ; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE3-NEXT: movd %edx, %xmm3 -; SSE3-NEXT: movd %r11d, %xmm1 +; SSE3-NEXT: movd %ecx, %xmm3 +; SSE3-NEXT: movd %r10d, %xmm1 ; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE3-NEXT: retq ; ; SSSE3-LABEL: avx2_hadd_d: @@ -1293,15 +1293,14 @@ ; SSE3-NEXT: .Lcfi23: ; SSE3-NEXT: .cfi_offset %rbp, -16 ; SSE3-NEXT: movd %xmm0, %eax -; SSE3-NEXT: pextrw $1, %xmm0, %ecx -; SSE3-NEXT: addl %eax, %ecx -; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; SSE3-NEXT: pextrw $1, %xmm0, %r10d +; SSE3-NEXT: addl %eax, %r10d ; SSE3-NEXT: pextrw $2, %xmm0, %eax -; SSE3-NEXT: pextrw $3, %xmm0, %r15d -; SSE3-NEXT: addl %eax, %r15d +; SSE3-NEXT: pextrw $3, %xmm0, %r11d +; SSE3-NEXT: addl %eax, %r11d ; SSE3-NEXT: pextrw $4, %xmm0, %eax -; SSE3-NEXT: pextrw $5, %xmm0, %r14d -; SSE3-NEXT: addl %eax, %r14d +; SSE3-NEXT: pextrw $5, %xmm0, %r12d +; SSE3-NEXT: addl %eax, %r12d ; SSE3-NEXT: pextrw $6, %xmm0, %eax ; SSE3-NEXT: pextrw $7, %xmm0, %r13d ; SSE3-NEXT: addl %eax, %r13d @@ -1310,70 +1309,71 @@ ; SSE3-NEXT: addl %eax, %ecx ; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill ; SSE3-NEXT: pextrw $2, %xmm1, %eax -; SSE3-NEXT: pextrw $3, %xmm1, %r11d -; SSE3-NEXT: addl %eax, %r11d -; SSE3-NEXT: pextrw $4, %xmm1, %eax -; SSE3-NEXT: pextrw $5, %xmm1, %r10d -; SSE3-NEXT: addl %eax, %r10d -; SSE3-NEXT: pextrw $6, %xmm1, %eax -; SSE3-NEXT: pextrw $7, %xmm1, %r12d -; SSE3-NEXT: addl %eax, %r12d -; SSE3-NEXT: movd %xmm2, %eax -; SSE3-NEXT: pextrw $1, %xmm2, %ebx -; SSE3-NEXT: addl %eax, %ebx -; SSE3-NEXT: pextrw $2, %xmm2, %eax -; SSE3-NEXT: pextrw $3, %xmm2, %ecx +; SSE3-NEXT: pextrw $3, %xmm1, %ecx ; SSE3-NEXT: addl %eax, %ecx +; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill +; SSE3-NEXT: pextrw $4, %xmm1, %eax +; SSE3-NEXT: pextrw $5, %xmm1, %r14d +; SSE3-NEXT: addl %eax, %r14d +; SSE3-NEXT: pextrw $6, %xmm1, %esi +; SSE3-NEXT: pextrw $7, %xmm1, %r15d +; SSE3-NEXT: addl %esi, %r15d +; SSE3-NEXT: movd %xmm2, %esi +; SSE3-NEXT: pextrw $1, %xmm2, %ebp +; SSE3-NEXT: addl %esi, %ebp +; SSE3-NEXT: pextrw $2, %xmm2, %esi +; SSE3-NEXT: pextrw $3, %xmm2, %edi +; SSE3-NEXT: addl %esi, %edi ; SSE3-NEXT: pextrw $4, %xmm2, %esi -; SSE3-NEXT: pextrw $5, %xmm2, %r8d -; SSE3-NEXT: addl %esi, %r8d +; SSE3-NEXT: pextrw $5, %xmm2, %eax +; SSE3-NEXT: addl %esi, %eax ; SSE3-NEXT: pextrw $6, %xmm2, %esi -; SSE3-NEXT: pextrw $7, %xmm2, %edx -; SSE3-NEXT: addl %esi, %edx -; SSE3-NEXT: movd %xmm3, %edi +; SSE3-NEXT: pextrw $7, %xmm2, %ecx +; SSE3-NEXT: addl %esi, %ecx +; SSE3-NEXT: movd %xmm3, %ebx ; SSE3-NEXT: pextrw $1, %xmm3, %r9d -; SSE3-NEXT: addl %edi, %r9d -; SSE3-NEXT: pextrw $2, %xmm3, %ebp -; SSE3-NEXT: pextrw $3, %xmm3, %edi -; SSE3-NEXT: addl %ebp, %edi -; SSE3-NEXT: pextrw $4, %xmm3, %eax -; SSE3-NEXT: pextrw $5, %xmm3, %ebp -; SSE3-NEXT: addl %eax, %ebp -; SSE3-NEXT: pextrw $6, %xmm3, %esi -; SSE3-NEXT: pextrw $7, %xmm3, %eax -; SSE3-NEXT: addl %esi, %eax -; SSE3-NEXT: movd %edx, %xmm8 -; SSE3-NEXT: movd %r13d, %xmm3 -; SSE3-NEXT: movd %ecx, %xmm9 -; SSE3-NEXT: movd %r15d, %xmm4 -; SSE3-NEXT: movd %r8d, 
%xmm10 -; SSE3-NEXT: movd %r14d, %xmm7 -; SSE3-NEXT: movd %ebx, %xmm11 -; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload -; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE3-NEXT: movd %eax, %xmm12 -; SSE3-NEXT: movd %r12d, %xmm6 -; SSE3-NEXT: movd %edi, %xmm13 -; SSE3-NEXT: movd %r11d, %xmm5 -; SSE3-NEXT: movd %ebp, %xmm14 -; SSE3-NEXT: movd %r10d, %xmm2 -; SSE3-NEXT: movd %r9d, %xmm15 +; SSE3-NEXT: addl %ebx, %r9d +; SSE3-NEXT: pextrw $2, %xmm3, %edx +; SSE3-NEXT: pextrw $3, %xmm3, %ebx +; SSE3-NEXT: addl %edx, %ebx +; SSE3-NEXT: pextrw $4, %xmm3, %edx +; SSE3-NEXT: pextrw $5, %xmm3, %esi +; SSE3-NEXT: addl %edx, %esi +; SSE3-NEXT: pextrw $6, %xmm3, %r8d +; SSE3-NEXT: pextrw $7, %xmm3, %edx +; SSE3-NEXT: addl %r8d, %edx +; SSE3-NEXT: movd %ecx, %xmm8 +; SSE3-NEXT: movd %eax, %xmm3 +; SSE3-NEXT: movd %edi, %xmm9 +; SSE3-NEXT: movd %ebp, %xmm4 +; SSE3-NEXT: movd %r13d, %xmm10 +; SSE3-NEXT: movd %r12d, %xmm7 +; SSE3-NEXT: movd %r11d, %xmm11 +; SSE3-NEXT: movd %r10d, %xmm0 +; SSE3-NEXT: movd %edx, %xmm12 +; SSE3-NEXT: movd %esi, %xmm6 +; SSE3-NEXT: movd %ebx, %xmm13 +; SSE3-NEXT: movd %r9d, %xmm5 +; SSE3-NEXT: movd %r15d, %xmm14 +; SSE3-NEXT: movd %r14d, %xmm2 +; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm15 # 4-byte Folded Reload +; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero ; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload ; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero ; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] ; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE3-NEXT: popq %rbx ; SSE3-NEXT: popq %r12 ; SSE3-NEXT: popq %r13 Index: test/CodeGen/X86/haddsub-undef.ll =================================================================== --- test/CodeGen/X86/haddsub-undef.ll +++ test/CodeGen/X86/haddsub-undef.ll @@ 
-171,9 +171,8 @@ ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: addss %xmm2, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; SSE-NEXT: movaps %xmm1, %xmm0 +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE-NEXT: movapd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: test8_undef: Index: test/CodeGen/X86/merge-consecutive-loads-128.ll =================================================================== --- test/CodeGen/X86/merge-consecutive-loads-128.ll +++ test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -269,10 +269,8 @@ ; SSE2-LABEL: merge_4f32_f32_012u: ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_012u: @@ -290,11 +288,11 @@ ; X32-SSE1-LABEL: merge_4f32_f32_012u: ; X32-SSE1: # BB#0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_012u: @@ -320,10 +318,8 @@ ; SSE2-LABEL: merge_4f32_f32_019u: ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_019u: @@ -341,11 +337,11 @@ ; X32-SSE1-LABEL: merge_4f32_f32_019u: ; X32-SSE1: # BB#0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_019u: @@ -1037,13 +1033,11 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp { ; SSE2-LABEL: merge_4f32_f32_2345_volatile: ; SSE2: # BB#0: -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_2345_volatile: @@ -1065,13 +1059,13 @@ ; X32-SSE1-LABEL: merge_4f32_f32_2345_volatile: ; X32-SSE1: # BB#0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_2345_volatile: Index: test/CodeGen/X86/select.ll =================================================================== --- test/CodeGen/X86/select.ll +++ test/CodeGen/X86/select.ll @@ -314,13 +314,13 @@ ; GENERIC-NEXT: jmp LBB7_6 ; GENERIC-NEXT: LBB7_4: ; GENERIC-NEXT: movd %r9d, %xmm1 -; GENERIC-NEXT: movd %ecx, %xmm2 +; GENERIC-NEXT: movd %r8d, %xmm2 ; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; GENERIC-NEXT: movd %r8d, %xmm3 +; GENERIC-NEXT: movd %ecx, %xmm3 ; GENERIC-NEXT: movd %edx, %xmm1 ; GENERIC-NEXT: LBB7_6: ; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1 ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0 ; GENERIC-NEXT: movq %xmm0, 16(%rsi) @@ -350,16 +350,16 @@ ; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; ATOM-NEXT: jmp LBB7_6 ; ATOM-NEXT: LBB7_4: ; ATOM-NEXT: movd %r9d, %xmm1 -; ATOM-NEXT: movd %ecx, %xmm2 +; ATOM-NEXT: movd %r8d, %xmm2 ; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; ATOM-NEXT: movd %r8d, %xmm3 +; ATOM-NEXT: movd %ecx, %xmm3 ; ATOM-NEXT: movd %edx, %xmm1 ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0 ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1 Index: test/CodeGen/X86/sse-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -53,17 +53,17 @@ ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: andl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-NEXT: andl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X32-NEXT: andl {{[0-9]+}}(%esp), 
%ecx ; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: andl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: leal -4(%ebp), %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %ebp @@ -86,18 +86,18 @@ ; X64-NEXT: shrq $32, %rsi ; X64-NEXT: shrq $32, %rdi ; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ; X64-NEXT: andl %r8d, %edi ; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ; X64-NEXT: andl %eax, %esi ; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> @@ -121,15 +121,15 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X32-NEXT: notl %edx -; X32-NEXT: notl %ecx ; X32-NEXT: notl %esi +; X32-NEXT: notl %ecx ; X32-NEXT: notl %eax ; X32-NEXT: andl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl %eax, (%esp) -; X32-NEXT: andl {{[0-9]+}}(%esp), %esi -; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: andl {{[0-9]+}}(%esp), %esi +; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X32-NEXT: andl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero @@ -138,7 +138,7 @@ ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: leal -4(%ebp), %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %ebp @@ -165,18 +165,18 @@ ; X64-NEXT: notl %esi ; X64-NEXT: notl %edx ; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ; X64-NEXT: andl %r8d, %edx ; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ; X64-NEXT: andl %edi, %esi ; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm2 = 
mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> @@ -1277,17 +1277,17 @@ ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: orl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-NEXT: orl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X32-NEXT: orl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: orl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: leal -4(%ebp), %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %ebp @@ -1310,18 +1310,18 @@ ; X64-NEXT: shrq $32, %rsi ; X64-NEXT: shrq $32, %rdi ; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ; X64-NEXT: orl %r8d, %edi ; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ; X64-NEXT: orl %eax, %esi ; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> @@ -1538,16 +1538,16 @@ ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_set_ps: ; X64: # BB#0: -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X64-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0] ; X64-NEXT: movaps %xmm3, %xmm0 ; X64-NEXT: retq %res0 = insertelement <4 x float> undef, float %a3, i32 0 @@ -1677,16 +1677,16 @@ ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: 
unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_setr_ps: ; X64: # BB#0: -; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-NEXT: retq %res0 = insertelement <4 x float> undef, float %a0, i32 0 %res1 = insertelement <4 x float> %res0, float %a1, i32 1 @@ -2239,17 +2239,17 @@ ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: xorl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X32-NEXT: xorl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: leal -4(%ebp), %esp ; X32-NEXT: popl %esi ; X32-NEXT: popl %ebp @@ -2272,18 +2272,18 @@ ; X64-NEXT: shrq $32, %rsi ; X64-NEXT: shrq $32, %rdi ; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ; X64-NEXT: xorl %r8d, %edi ; X64-NEXT: movl %edi, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) ; X64-NEXT: xorl %eax, %esi ; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %arg0 = bitcast <4 x float> %a0 to <4 x i32> %arg1 = bitcast <4 x float> %a1 to <4 x i32> Index: test/CodeGen/X86/sse1.ll =================================================================== --- test/CodeGen/X86/sse1.ll +++ test/CodeGen/X86/sse1.ll @@ -87,17 +87,17 @@ ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: .LBB1_11: # %entry ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X32-NEXT: retl ; ; X64-LABEL: vselect: ; X64: # BB#0: # %entry -; X64-NEXT: testl %ecx, %ecx +; X64-NEXT: testl %edx, %edx ; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: je .LBB1_1 ; X64-NEXT: # BB#2: # %entry ; X64-NEXT: xorps %xmm1, %xmm1 -; 
X64-NEXT: testl %edx, %edx +; X64-NEXT: testl %ecx, %ecx ; X64-NEXT: jne .LBB1_5 ; X64-NEXT: .LBB1_4: ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero @@ -111,7 +111,7 @@ ; X64-NEXT: jmp .LBB1_11 ; X64-NEXT: .LBB1_1: ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: testl %edx, %edx +; X64-NEXT: testl %ecx, %ecx ; X64-NEXT: je .LBB1_4 ; X64-NEXT: .LBB1_5: # %entry ; X64-NEXT: xorps %xmm2, %xmm2 @@ -126,7 +126,7 @@ ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: .LBB1_11: # %entry ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-NEXT: retq entry: %a1 = icmp eq <4 x i32> %q, zeroinitializer @@ -252,12 +252,12 @@ ; X32-NEXT: movl %eax, (%esp) ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movl {{[0-9]+}}(%esp), %edx -; X32-NEXT: andl %ecx, %edx -; X32-NEXT: notl %ecx -; X32-NEXT: andl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: orl %edx, %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X32-NEXT: andl %eax, %ecx +; X32-NEXT: notl %eax +; X32-NEXT: andl {{[0-9]+}}(%esp), %eax +; X32-NEXT: orl %ecx, %eax +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: andl %ecx, %edx @@ -277,7 +277,7 @@ ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: movl %ebp, %esp ; X32-NEXT: popl %ebp ; X32-NEXT: retl @@ -297,48 +297,48 @@ ; X64-NEXT: mulps %xmm1, %xmm0 ; X64-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r9 -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r10 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdi -; X64-NEXT: movl %r9d, %esi -; X64-NEXT: andl %edi, %esi +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andl %edi, %eax ; X64-NEXT: movl %edi, %ecx ; X64-NEXT: notl %ecx +; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r10 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rdx -; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax -; X64-NEXT: andl %eax, %ecx -; X64-NEXT: orl %esi, %ecx +; X64-NEXT: andl %edx, %ecx +; X64-NEXT: orl %eax, %ecx ; X64-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) -; X64-NEXT: movl %r8d, %ecx -; X64-NEXT: andl %r10d, %ecx -; X64-NEXT: movl %r10d, %esi -; X64-NEXT: notl %esi -; X64-NEXT: andl %edx, %esi -; X64-NEXT: orl %ecx, %esi -; X64-NEXT: movl %esi, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrq $32, %r9 +; X64-NEXT: shrq $32, %rsi ; X64-NEXT: shrq $32, %rdi -; X64-NEXT: andl %edi, %r9d +; X64-NEXT: andl %edi, %esi ; X64-NEXT: notl %edi -; X64-NEXT: shrq $32, %rax -; X64-NEXT: andl %edi, %eax -; X64-NEXT: orl %r9d, %eax -; X64-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; X64-NEXT: shrq $32, %r8 -; X64-NEXT: shrq $32, %r10 -; X64-NEXT: andl %r10d, %r8d -; X64-NEXT: notl %r10d ; X64-NEXT: shrq $32, %rdx -; X64-NEXT: andl %r10d, %edx -; X64-NEXT: orl %r8d, %edx +; X64-NEXT: andl %edi, %edx +; X64-NEXT: orl %esi, %edx ; X64-NEXT: movl %edx, -{{[0-9]+}}(%rsp) +; X64-NEXT: movl %r8d, %eax +; X64-NEXT: andl %r9d, %eax +; X64-NEXT: movl %r9d, %ecx +; X64-NEXT: notl %ecx +; X64-NEXT: andl %r10d, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movl 
%ecx, -{{[0-9]+}}(%rsp) +; X64-NEXT: shrq $32, %r8 +; X64-NEXT: shrq $32, %r9 +; X64-NEXT: andl %r9d, %r8d +; X64-NEXT: notl %r9d +; X64-NEXT: shrq $32, %r10 +; X64-NEXT: andl %r9d, %r10d +; X64-NEXT: orl %r8d, %r10d +; X64-NEXT: movl %r10d, -{{[0-9]+}}(%rsp) ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %t0 = call fast <2 x float> @llvm.sqrt.v2f32(<2 x float> ) ret <2 x float> %t0 Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -2076,7 +2076,7 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm2 ; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax @@ -2087,8 +2087,8 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm1 ; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax @@ -2099,7 +2099,7 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm3 ; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax @@ -2110,27 +2110,27 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_set_epi8: ; X64: # BB#0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: movzbl %dl, %eax ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X64-NEXT: movzbl %dl, %eax +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl %r9b, %eax ; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax @@ -2138,20 +2138,20 @@ ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd 
%eax, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm2 @@ -2161,9 +2161,9 @@ ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %res0 = insertelement <16 x i8> undef, i8 %a15, i32 0 %res1 = insertelement <16 x i8> %res0, i8 %a14, i32 1 @@ -2206,11 +2206,11 @@ ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; X32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_set_epi16: @@ -2218,20 +2218,20 @@ ; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w ; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax ; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: movd %r8d, %xmm1 +; X64-NEXT: movd %esi, %xmm1 ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X64-NEXT: movd %edx, %xmm0 -; X64-NEXT: movd %eax, %xmm2 +; X64-NEXT: movd %ecx, %xmm2 ; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X64-NEXT: movd %esi, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: movd %r8d, %xmm0 ; X64-NEXT: movd %r9d, %xmm1 ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-NEXT: movd %ecx, %xmm3 +; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: movd %r10d, %xmm0 ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-NEXT: retq %res0 = insertelement <8 x i16> undef, i16 %a7, i32 0 %res1 = insertelement <8 x i16> %res0, i16 %a6, i32 1 @@ -2254,18 +2254,18 @@ ; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_set_epi32: ; X64: # BB#0: ; X64-NEXT: movd %edi, %xmm0 -; X64-NEXT: movd %edx, %xmm1 +; X64-NEXT: movd %esi, %xmm1 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %edx, %xmm2 ; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %res0 = insertelement <4 x i32> undef, i32 %a3, i32 0 %res1 = insertelement <4 x i32> %res0, i32 %a2, i32 1 @@ -2282,11 +2282,11 @@ ; X32: # BB#0: ; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_set_epi64x: @@ -2441,10 +2441,9 @@ ; X32-LABEL: test_mm_set1_epi64x: ; X32: # BB#0: ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] ; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_set1_epi64x: @@ -2486,7 +2485,7 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm2 ; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X32-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax @@ -2497,8 +2496,8 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; 
X32-NEXT: movd %eax, %xmm1 ; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X32-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax @@ -2509,7 +2508,7 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm3 ; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X32-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax @@ -2520,9 +2519,9 @@ ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; X32-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_setr_epi8: @@ -2534,46 +2533,46 @@ ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movzbl %cl, %eax +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movzbl %r9b, %eax +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movzbl %sil, %eax +; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm1 ; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl %r9b, %eax ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movzbl %dl, %eax +; X64-NEXT: movzbl %r8b, %eax ; X64-NEXT: movd %eax, %xmm3 ; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: movzbl %cl, %eax ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movzbl %r8b, %eax +; X64-NEXT: movzbl %dl, %eax ; X64-NEXT: movd %eax, %xmm2 ; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X64-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: movd %eax, %xmm4 ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: movd %eax, %xmm0 ; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %res0 = insertelement <16 x i8> undef, i8 %a0 , i32 0 %res1 = insertelement <16 x i8> %res0, i8 %a1 , i32 1 @@ -2616,11 +2615,11 @@ ; X32-NEXT: movd %eax, %xmm0 ; X32-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; X32-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; X32-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; X32-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] ; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] -; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; X32-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_setr_epi16: @@ -2628,20 +2627,20 @@ ; X64-NEXT: movw {{[0-9]+}}(%rsp), %ax ; X64-NEXT: movw {{[0-9]+}}(%rsp), %r10w ; X64-NEXT: movd %eax, %xmm0 -; X64-NEXT: movd %ecx, %xmm1 +; X64-NEXT: movd %r10d, %xmm1 ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X64-NEXT: movd %r9d, %xmm0 -; X64-NEXT: movd %esi, %xmm2 +; X64-NEXT: movd %r8d, %xmm2 ; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X64-NEXT: movd %r10d, %xmm0 +; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; X64-NEXT: movd %ecx, %xmm0 ; X64-NEXT: movd %edx, %xmm1 ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-NEXT: movd %r8d, %xmm3 +; X64-NEXT: movd %esi, %xmm3 ; X64-NEXT: movd %edi, %xmm0 ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; X64-NEXT: retq %res0 = insertelement <8 x i16> undef, i16 %a0, i32 0 %res1 = insertelement <8 x i16> %res0, i16 %a1, i32 1 @@ -2664,18 +2663,18 @@ ; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_setr_epi32: ; X64: # BB#0: ; X64-NEXT: movd %ecx, %xmm0 -; X64-NEXT: movd %esi, %xmm1 +; X64-NEXT: movd %edx, %xmm1 ; X64-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-NEXT: movd %edx, %xmm2 +; X64-NEXT: movd %esi, %xmm2 ; X64-NEXT: movd %edi, %xmm0 ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-NEXT: retq %res0 = insertelement <4 x i32> undef, i32 %a0, i32 0 %res1 = insertelement <4 x i32> %res0, i32 %a1, i32 1 @@ -2692,11 +2691,11 @@ ; X32: # BB#0: ; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; 
X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; X32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT: retl
;
; X64-LABEL: test_mm_setr_epi64x:
Index: test/CodeGen/X86/sse3-avx-addsub-2.ll
===================================================================
--- test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -342,9 +342,8 @@
; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: subss %xmm1, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,1,3]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test14:
@@ -375,8 +374,7 @@
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,2,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[0,0]
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -417,10 +415,10 @@
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT: addss %xmm0, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: test16:
Index: test/CodeGen/X86/vec_fp_to_int.ll
===================================================================
--- test/CodeGen/X86/vec_fp_to_int.ll
+++ test/CodeGen/X86/vec_fp_to_int.ll
@@ -1320,17 +1320,17 @@
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -1560,33 +1560,33 @@
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: movaps %xmm2, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
; SSE-NEXT: movaps %xmm1, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
+; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm1, %rax
; SSE-NEXT: movd %eax, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
;
Index: test/CodeGen/X86/vec_int_to_fp.ll
===================================================================
--- test/CodeGen/X86/vec_int_to_fp.ll
+++ test/CodeGen/X86/vec_int_to_fp.ll
@@ -1169,16 +1169,16 @@
define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
; SSE-LABEL: sitofp_4i64_to_4f32_undef:
; SSE: # BB#0:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,0]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -1368,21 +1368,22 @@
; SSE-LABEL: sitofp_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: sitofp_4i64_to_4f32:
@@ -1838,21 +1839,14 @@
; SSE-LABEL: uitofp_4i64_to_4f32_undef:
; SSE: # BB#0:
; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: js .LBB41_2
-; SSE-NEXT: # BB#1:
-; SSE-NEXT: xorps %xmm2, %xmm2
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: .LBB41_2:
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB41_3
-; SSE-NEXT: # BB#4:
+; SSE-NEXT: js .LBB41_1
+; SSE-NEXT: # BB#2:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: jmp .LBB41_5
-; SSE-NEXT: .LBB41_3:
+; SSE-NEXT: jmp .LBB41_3
+; SSE-NEXT: .LBB41_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
@@ -1860,17 +1854,16 @@
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
-; SSE-NEXT: .LBB41_5:
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: .LBB41_3:
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB41_6
-; SSE-NEXT: # BB#7:
+; SSE-NEXT: js .LBB41_4
+; SSE-NEXT: # BB#5:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: jmp .LBB41_8
-; SSE-NEXT: .LBB41_6:
+; SSE-NEXT: jmp .LBB41_6
+; SSE-NEXT: .LBB41_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
@@ -1878,9 +1871,16 @@
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: cvtsi2ssq %rax, %xmm1
; SSE-NEXT: addss %xmm1, %xmm1
-; SSE-NEXT: .LBB41_8:
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE-NEXT: .LBB41_6:
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: js .LBB41_8
+; SSE-NEXT: # BB#7:
+; SSE-NEXT: xorps %xmm1, %xmm1
+; SSE-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE-NEXT: .LBB41_8:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
; SSE-NEXT: retq
;
; VEX-LABEL: uitofp_4i64_to_4f32_undef:
@@ -2149,32 +2149,32 @@
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_1
; SSE-NEXT: # BB#2:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
; SSE-NEXT: jmp .LBB47_3
; SSE-NEXT: .LBB47_1:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm3
-; SSE-NEXT: addss %xmm3, %xmm3
+; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: addss %xmm2, %xmm2
; SSE-NEXT: .LBB47_3:
-; SSE-NEXT: movq %xmm0, %rax
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE-NEXT: movq %xmm1, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_4
; SSE-NEXT: # BB#5:
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
; SSE-NEXT: jmp .LBB47_6
; SSE-NEXT: .LBB47_4:
; SSE-NEXT: movq %rax, %rcx
; SSE-NEXT: shrq %rcx
; SSE-NEXT: andl $1, %eax
; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: addss %xmm2, %xmm2
+; SSE-NEXT: cvtsi2ssq %rax, %xmm3
+; SSE-NEXT: addss %xmm3, %xmm3
; SSE-NEXT: .LBB47_6:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movq %xmm1, %rax
+; SSE-NEXT: movq %xmm0, %rax
; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: js .LBB47_7
; SSE-NEXT: # BB#8:
@@ -2208,9 +2208,9 @@
; SSE-NEXT: cvtsi2ssq %rax, %xmm0
; SSE-NEXT: addss %xmm0, %xmm0
; SSE-NEXT: .LBB47_12:
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE-NEXT: movaps %xmm2, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: uitofp_4i64_to_4f32:
@@ -3381,22 +3381,23 @@
; SSE-LABEL: sitofp_load_4i64_to_4f32:
; SSE: # BB#0:
; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: sitofp_load_4i64_to_4f32: @@ -3546,41 +3547,42 @@ ; SSE-LABEL: sitofp_load_8i64_to_8f32: ; SSE: # BB#0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm3 -; SSE-NEXT: movdqa 48(%rdi), %xmm4 -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm3 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq %xmm4, %rax -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; SSE-NEXT: movq %xmm3, %rax +; SSE-NEXT: xorps %xmm4, %xmm4 +; SSE-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[2,3,0,1] +; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: movq %xmm2, %rax +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: xorps %xmm2, %xmm2 ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: movq %xmm3, %rax -; SSE-NEXT: xorps %xmm3, %xmm3 -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: sitofp_load_8i64_to_8f32: @@ -3822,73 +3824,73 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { ; SSE-LABEL: uitofp_load_4i64_to_4f32: ; SSE: # BB#0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm3 -; SSE-NEXT: movq %xmm3, %rax +; SSE-NEXT: movdqa (%rdi), %xmm2 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_1 ; SSE-NEXT: # BB#2: -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: jmp .LBB76_3 ; SSE-NEXT: .LBB76_1: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: addss %xmm2, %xmm2 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE-NEXT: addss %xmm1, %xmm1 ; SSE-NEXT: .LBB76_3: -; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_4 ; SSE-NEXT: # BB#5: -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: jmp .LBB76_6 ; SSE-NEXT: .LBB76_4: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: addss %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE-NEXT: addss %xmm3, %xmm3 ; SSE-NEXT: .LBB76_6: -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: movq %xmm3, %rax +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_7 ; SSE-NEXT: # BB#8: -; SSE-NEXT: xorps %xmm3, %xmm3 -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: jmp .LBB76_9 ; SSE-NEXT: .LBB76_7: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm3, %xmm3 -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: addss %xmm3, %xmm3 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB76_9: -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB76_10 ; SSE-NEXT: # BB#11: -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: jmp .LBB76_12 ; SSE-NEXT: .LBB76_10: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: addss %xmm1, %xmm1 +; SSE-NEXT: xorps %xmm2, %xmm2 +; SSE-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB76_12: -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: uitofp_load_4i64_to_4f32: @@ -4186,121 +4188,121 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; SSE-LABEL: uitofp_load_8i64_to_8f32: ; SSE: # BB#0: -; SSE-NEXT: movdqa (%rdi), 
%xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm5 +; SSE-NEXT: movdqa (%rdi), %xmm5 +; SSE-NEXT: movdqa 16(%rdi), %xmm0 ; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movq %xmm5, %rax +; SSE-NEXT: movdqa 48(%rdi), %xmm1 +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_1 ; SSE-NEXT: # BB#2: -; SSE-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE-NEXT: cvtsi2ssq %rax, %xmm3 ; SSE-NEXT: jmp .LBB80_3 ; SSE-NEXT: .LBB80_1: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm4 -; SSE-NEXT: addss %xmm4, %xmm4 +; SSE-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE-NEXT: addss %xmm3, %xmm3 ; SSE-NEXT: .LBB80_3: -; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-NEXT: movq %xmm0, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_4 ; SSE-NEXT: # BB#5: -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm4 ; SSE-NEXT: jmp .LBB80_6 ; SSE-NEXT: .LBB80_4: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: addss %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE-NEXT: addss %xmm4, %xmm4 ; SSE-NEXT: .LBB80_6: -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] ; SSE-NEXT: movq %xmm5, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_7 ; SSE-NEXT: # BB#8: -; SSE-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 ; SSE-NEXT: jmp .LBB80_9 ; SSE-NEXT: .LBB80_7: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm6 -; SSE-NEXT: addss %xmm6, %xmm6 +; SSE-NEXT: xorps %xmm0, %xmm0 +; SSE-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE-NEXT: addss %xmm0, %xmm0 ; SSE-NEXT: .LBB80_9: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; SSE-NEXT: movq %xmm5, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_10 ; SSE-NEXT: # BB#11: -; SSE-NEXT: xorps %xmm5, %xmm5 -; SSE-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE-NEXT: cvtsi2ssq %rax, %xmm6 ; SSE-NEXT: jmp .LBB80_12 ; SSE-NEXT: .LBB80_10: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm5, %xmm5 -; SSE-NEXT: cvtsi2ssq %rax, %xmm5 -; SSE-NEXT: addss %xmm5, %xmm5 +; SSE-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE-NEXT: addss %xmm6, %xmm6 ; SSE-NEXT: .LBB80_12: -; SSE-NEXT: movq %xmm3, %rax +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_13 ; SSE-NEXT: # BB#14: -; SSE-NEXT: cvtsi2ssq %rax, %xmm7 +; SSE-NEXT: xorps %xmm5, %xmm5 +; SSE-NEXT: cvtsi2ssq %rax, %xmm5 ; SSE-NEXT: jmp .LBB80_15 ; SSE-NEXT: .LBB80_13: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm7 -; SSE-NEXT: addss %xmm7, %xmm7 +; SSE-NEXT: xorps %xmm5, %xmm5 +; SSE-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE-NEXT: addss %xmm5, %xmm5 ; SSE-NEXT: .LBB80_15: -; SSE-NEXT: movq %xmm2, %rax +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_16 ; SSE-NEXT: # BB#17: -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm7 ; SSE-NEXT: jmp .LBB80_18 ; SSE-NEXT: .LBB80_16: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; 
SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: addss %xmm1, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm7 +; SSE-NEXT: addss %xmm7, %xmm7 ; SSE-NEXT: .LBB80_18: -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE-NEXT: movq %xmm3, %rax +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: testq %rax, %rax ; SSE-NEXT: js .LBB80_19 ; SSE-NEXT: # BB#20: -; SSE-NEXT: xorps %xmm3, %xmm3 -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 ; SSE-NEXT: jmp .LBB80_21 ; SSE-NEXT: .LBB80_19: ; SSE-NEXT: movq %rax, %rcx ; SSE-NEXT: shrq %rcx ; SSE-NEXT: andl $1, %eax ; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm3, %xmm3 -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: addss %xmm3, %xmm3 +; SSE-NEXT: xorps %xmm1, %xmm1 +; SSE-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE-NEXT: addss %xmm1, %xmm1 ; SSE-NEXT: .LBB80_21: -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; SSE-NEXT: movq %xmm2, %rax ; SSE-NEXT: testq %rax, %rax @@ -4318,8 +4320,8 @@ ; SSE-NEXT: cvtsi2ssq %rax, %xmm2 ; SSE-NEXT: addss %xmm2, %xmm2 ; SSE-NEXT: .LBB80_24: -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm5[0] ; SSE-NEXT: retq ; ; AVX1-LABEL: uitofp_load_8i64_to_8f32: Index: test/CodeGen/X86/vec_set.ll =================================================================== --- test/CodeGen/X86/vec_set.ll +++ test/CodeGen/X86/vec_set.ll @@ -12,35 +12,35 @@ ; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0] ; X86-NEXT: movdqa %xmm3, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: test: ; X64: # BB#0: -; X64-NEXT: movd %r8d, %xmm0 +; X64-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: movd %edx, %xmm1 -; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: movd %r9d, %xmm0 ; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-NEXT: movd %r9d, %xmm2 +; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-NEXT: movd %r8d, %xmm1 +; X64-NEXT: movd %ecx, %xmm2 +; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X64-NEXT: movd %edx, %xmm1 ; X64-NEXT: movd %esi, %xmm3 -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X64-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X64-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] ; X64-NEXT: movdqa %xmm3, (%rdi) ; X64-NEXT: retq %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0 Index: test/CodeGen/X86/vector-rem.ll =================================================================== --- test/CodeGen/X86/vector-rem.ll +++ test/CodeGen/X86/vector-rem.ll @@ -11,9 +11,9 @@ ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; CHECK-NEXT: movd %xmm3, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; CHECK-NEXT: movd %xmm3, %ecx ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx @@ -24,15 +24,15 @@ ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; CHECK-NEXT: movd %xmm0, %ecx ; CHECK-NEXT: cltd ; CHECK-NEXT: idivl %ecx ; CHECK-NEXT: movd %edx, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %m = srem <4 x i32> %t, %u @@ -49,9 +49,9 @@ ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] ; CHECK-NEXT: movd %xmm3, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] ; CHECK-NEXT: movd %xmm3, %ecx ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx @@ -62,15 +62,15 @@ ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx ; CHECK-NEXT: movd %edx, %xmm2 -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; CHECK-NEXT: movd %xmm0, %ecx ; 
CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: divl %ecx ; CHECK-NEXT: movd %edx, %xmm0 ; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; CHECK-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; CHECK-NEXT: movdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %m = urem <4 x i32> %t, %u @@ -88,9 +88,9 @@ ; CHECK-NEXT: callq fmodf ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] +; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] +; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] ; CHECK-NEXT: callq fmodf ; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] @@ -100,15 +100,15 @@ ; CHECK-NEXT: callq fmodf ; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] +; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] ; CHECK-NEXT: callq fmodf ; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; CHECK-NEXT: unpcklps (%rsp), %xmm1 # 16-byte Folded Reload -; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movapd %xmm1, %xmm0 ; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: retq %m = frem <4 x float> %t, %u Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ test/CodeGen/X86/vector-sext.ll @@ -1333,19 +1333,19 @@ ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $62, %rcx +; SSE2-NEXT: shlq $61, %rcx ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $61, %rcx +; SSE2-NEXT: shlq $62, %rcx ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: shlq $63, %rax ; SSE2-NEXT: sarq $63, %rax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_4i1_to_4i32: @@ -1356,19 +1356,19 @@ ; SSSE3-NEXT: sarq $63, %rcx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $62, %rcx +; SSSE3-NEXT: shlq $61, %rcx ; SSSE3-NEXT: sarq $63, %rcx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $61, %rcx +; SSSE3-NEXT: shlq $62, %rcx ; SSSE3-NEXT: sarq $63, %rcx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: shlq $63, %rax ; SSSE3-NEXT: sarq $63, %rax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; 
SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_4i1_to_4i32: @@ -1523,14 +1523,14 @@ ; SSE2-NEXT: shrl $3, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl %ecx +; SSE2-NEXT: shrl $2, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: shrl $2, %eax +; SSE2-NEXT: shrl %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] ; SSE2-NEXT: psllq $63, %xmm0 @@ -1549,14 +1549,14 @@ ; SSSE3-NEXT: shrl $3, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl %ecx +; SSSE3-NEXT: shrl $2, %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] ; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: shrl $2, %eax +; SSSE3-NEXT: shrl %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] ; SSSE3-NEXT: psllq $63, %xmm0 @@ -1813,7 +1813,7 @@ ; SSE2-NEXT: shrq $7, %rcx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $60, %rcx +; SSE2-NEXT: shlq $57, %rcx ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -1822,13 +1822,13 @@ ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $62, %rcx +; SSE2-NEXT: shlq $59, %rcx ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $57, %rcx +; SSE2-NEXT: shlq $60, %rcx ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movq %rax, %rcx @@ -1837,15 +1837,15 @@ ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $59, %rcx +; SSE2-NEXT: shlq $62, %rcx ; SSE2-NEXT: sarq $63, %rcx ; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: shlq $63, %rax ; SSE2-NEXT: sarq $63, %rax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_8i1_to_8i16: @@ -1855,7 +1855,7 @@ ; SSSE3-NEXT: shrq $7, %rcx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $60, %rcx +; 
SSSE3-NEXT: shlq $57, %rcx ; SSSE3-NEXT: sarq $63, %rcx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] @@ -1864,13 +1864,13 @@ ; SSSE3-NEXT: sarq $63, %rcx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $62, %rcx +; SSSE3-NEXT: shlq $59, %rcx ; SSSE3-NEXT: sarq $63, %rcx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $57, %rcx +; SSSE3-NEXT: shlq $60, %rcx ; SSSE3-NEXT: sarq $63, %rcx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movq %rax, %rcx @@ -1879,15 +1879,15 @@ ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $59, %rcx +; SSSE3-NEXT: shlq $62, %rcx ; SSSE3-NEXT: sarq $63, %rcx ; SSSE3-NEXT: movd %ecx, %xmm3 ; SSSE3-NEXT: shlq $63, %rax ; SSSE3-NEXT: sarq $63, %rax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_8i1_to_8i16: @@ -2191,7 +2191,7 @@ ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movzbl (%rdi), %eax ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $6, %ecx +; SSE2-NEXT: shrl $3, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx @@ -2203,30 +2203,30 @@ ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $4, %ecx +; SSE2-NEXT: shrl %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $5, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl %ecx +; SSE2-NEXT: shrl $4, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $3, %ecx +; SSE2-NEXT: shrl $6, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: shrl $7, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 @@ -2240,7 +2240,7 @@ ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movzbl (%rdi), %eax ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $6, %ecx +; SSSE3-NEXT: shrl $3, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx @@ -2252,30 +2252,30 @@ ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $4, %ecx +; SSSE3-NEXT: shrl %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $5, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl %ecx +; SSSE3-NEXT: shrl $4, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $3, %ecx +; SSSE3-NEXT: shrl $6, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: shrl $7, %eax ; SSSE3-NEXT: movzwl %ax, %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm0 @@ -2546,69 +2546,69 @@ ; SSE2-NEXT: movq %rax, %rsi ; SSE2-NEXT: movq %rax, %rdi ; SSE2-NEXT: movq %rax, %rbp -; SSE2-NEXT: shlq $49, %rbp -; SSE2-NEXT: sarq $63, %rbp +; SSE2-NEXT: shrq $15, %rbp ; SSE2-NEXT: movd %ebp, %xmm0 ; SSE2-NEXT: movq %rax, %rbp ; SSE2-NEXT: movsbq %al, %rax -; SSE2-NEXT: shlq $57, %r8 +; SSE2-NEXT: shlq $49, %r8 ; SSE2-NEXT: sarq $63, %r8 ; SSE2-NEXT: movd %r8d, %xmm1 -; SSE2-NEXT: shlq $53, %r9 +; SSE2-NEXT: shlq $50, %r9 ; SSE2-NEXT: sarq $63, %r9 ; SSE2-NEXT: movd %r9d, %xmm2 -; SSE2-NEXT: shlq $61, %r10 +; SSE2-NEXT: shlq $51, %r10 ; SSE2-NEXT: sarq $63, %r10 ; SSE2-NEXT: movd %r10d, %xmm3 -; SSE2-NEXT: shlq $51, %r11 +; SSE2-NEXT: shlq $52, %r11 ; SSE2-NEXT: sarq $63, %r11 ; SSE2-NEXT: movd %r11d, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: shlq $59, %r14 +; SSE2-NEXT: shlq $53, %r14 ; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movd %r14d, %xmm5 +; SSE2-NEXT: movd %r14d, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: shlq $55, %r15 +; SSE2-NEXT: shlq $54, %r15 ; SSE2-NEXT: sarq $63, %r15 ; SSE2-NEXT: movd %r15d, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: shlq $63, %r12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: shlq $55, %r12 ; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movd %r12d, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: shlq $50, %r13 +; SSE2-NEXT: movd %r12d, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: shlq $60, %r13 ; SSE2-NEXT: sarq $63, %r13 -; SSE2-NEXT: movd %r13d, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: shlq $58, %rbx +; SSE2-NEXT: movd %r13d, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: shlq $61, %rbx ; SSE2-NEXT: sarq $63, %rbx ; SSE2-NEXT: movd %ebx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: shlq $54, %rcx +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: shlq $62, %rcx ; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: shlq $62, %rdx +; SSE2-NEXT: movd %ecx, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: shlq $63, %rdx ; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: shlq $52, %rsi +; SSE2-NEXT: movd %edx, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE2-NEXT: shlq $58, %rsi ; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movd %esi, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: shlq $60, %rdi +; SSE2-NEXT: movd %esi, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE2-NEXT: shlq $59, %rdi ; SSE2-NEXT: sarq $63, %rdi ; SSE2-NEXT: movd %edi, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE2-NEXT: shrq $15, %rbp -; SSE2-NEXT: movd %ebp, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: shlq $57, %rbp +; SSE2-NEXT: sarq $63, %rbp +; SSE2-NEXT: movd %ebp, %xmm2 ; SSE2-NEXT: shrq $7, %rax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -2640,69 +2640,69 @@ ; SSSE3-NEXT: movq %rax, %rsi ; SSSE3-NEXT: movq %rax, %rdi ; SSSE3-NEXT: movq %rax, %rbp -; SSSE3-NEXT: shlq $49, %rbp -; SSSE3-NEXT: sarq $63, %rbp +; SSSE3-NEXT: shrq $15, %rbp ; SSSE3-NEXT: movd %ebp, %xmm0 ; SSSE3-NEXT: movq %rax, %rbp ; SSSE3-NEXT: movsbq %al, %rax -; SSSE3-NEXT: shlq $57, %r8 +; SSSE3-NEXT: shlq $49, %r8 ; SSSE3-NEXT: sarq $63, %r8 ; SSSE3-NEXT: movd %r8d, %xmm1 -; SSSE3-NEXT: shlq $53, %r9 +; SSSE3-NEXT: shlq $50, %r9 ; SSSE3-NEXT: sarq $63, %r9 ; SSSE3-NEXT: movd %r9d, %xmm2 -; SSSE3-NEXT: shlq $61, %r10 +; SSSE3-NEXT: shlq $51, %r10 ; SSSE3-NEXT: sarq $63, %r10 ; SSSE3-NEXT: movd %r10d, %xmm3 -; SSSE3-NEXT: shlq $51, %r11 +; SSSE3-NEXT: shlq $52, %r11 ; SSSE3-NEXT: sarq $63, %r11 ; SSSE3-NEXT: movd %r11d, %xmm4 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: shlq $59, %r14 +; SSSE3-NEXT: shlq $53, %r14 ; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movd %r14d, %xmm5 +; SSSE3-NEXT: movd %r14d, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: shlq $55, %r15 +; SSSE3-NEXT: shlq $54, %r15 ; SSSE3-NEXT: sarq $63, %r15 ; SSSE3-NEXT: movd %r15d, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSSE3-NEXT: shlq $63, %r12 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSSE3-NEXT: shlq $55, %r12 ; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movd %r12d, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSSE3-NEXT: shlq $50, %r13 +; SSSE3-NEXT: movd 
%r12d, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: shlq $60, %r13 ; SSSE3-NEXT: sarq $63, %r13 -; SSSE3-NEXT: movd %r13d, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: shlq $58, %rbx +; SSSE3-NEXT: movd %r13d, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: shlq $61, %rbx ; SSSE3-NEXT: sarq $63, %rbx ; SSSE3-NEXT: movd %ebx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: shlq $54, %rcx +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: shlq $62, %rcx ; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: shlq $62, %rdx +; SSSE3-NEXT: movd %ecx, %xmm5 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSSE3-NEXT: shlq $63, %rdx ; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: shlq $52, %rsi +; SSSE3-NEXT: movd %edx, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSSE3-NEXT: shlq $58, %rsi ; SSSE3-NEXT: sarq $63, %rsi -; SSSE3-NEXT: movd %esi, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: shlq $60, %rdi +; SSSE3-NEXT: movd %esi, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSSE3-NEXT: shlq $59, %rdi ; SSSE3-NEXT: sarq $63, %rdi ; SSSE3-NEXT: movd %edi, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSSE3-NEXT: shrq $15, %rbp -; SSSE3-NEXT: movd %ebp, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSSE3-NEXT: shlq $57, %rbp +; SSSE3-NEXT: sarq $63, %rbp +; SSSE3-NEXT: movd %ebp, %xmm2 ; SSSE3-NEXT: shrq $7, %rax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: 
punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 @@ -3002,7 +3002,7 @@ ; SSE2: # BB#0: # %entry ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $14, %ecx +; SSE2-NEXT: shrl $7, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx @@ -3011,21 +3011,21 @@ ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $10, %ecx +; SSE2-NEXT: shrl $5, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $2, %ecx +; SSE2-NEXT: shrl $4, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $12, %ecx +; SSE2-NEXT: shrl $3, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $4, %ecx +; SSE2-NEXT: shrl $2, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] @@ -3033,18 +3033,18 @@ ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm1 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $8, %ecx +; SSE2-NEXT: shrl %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; 
SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $13, %ecx +; SSE2-NEXT: shrl $11, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $5, %ecx +; SSE2-NEXT: shrl $10, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -3053,31 +3053,31 @@ ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl %ecx +; SSE2-NEXT: shrl $8, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $11, %ecx +; SSE2-NEXT: shrl $13, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $3, %ecx +; SSE2-NEXT: shrl $12, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $7, %ecx +; SSE2-NEXT: shrl $14, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 ; SSE2-NEXT: shrl $15, %eax ; SSE2-NEXT: movzwl %ax, %eax ; SSE2-NEXT: movd %eax, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psllw $15, %xmm0 @@ -3091,7 +3091,7 @@ ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movzwl (%rdi), %eax ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $14, %ecx +; SSSE3-NEXT: shrl $7, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx @@ -3100,21 +3100,21 @@ ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $10, %ecx +; SSSE3-NEXT: shrl $5, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $2, %ecx +; SSSE3-NEXT: shrl $4, 
%ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $12, %ecx +; SSSE3-NEXT: shrl $3, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $4, %ecx +; SSSE3-NEXT: shrl $2, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm3 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] @@ -3122,18 +3122,18 @@ ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm1 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $8, %ecx +; SSSE3-NEXT: shrl %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $13, %ecx +; SSSE3-NEXT: shrl $11, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $5, %ecx +; SSSE3-NEXT: shrl $10, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] @@ -3142,31 +3142,31 @@ ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm3 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl %ecx +; SSSE3-NEXT: shrl $8, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $11, %ecx +; SSSE3-NEXT: shrl $13, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $3, %ecx +; SSSE3-NEXT: shrl $12, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm3 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $7, %ecx +; SSSE3-NEXT: 
shrl $14, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 ; SSSE3-NEXT: shrl $15, %eax ; SSSE3-NEXT: movzwl %ax, %eax ; SSSE3-NEXT: movd %eax, %xmm4 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psllw $15, %xmm0 @@ -3556,162 +3556,162 @@ ; SSE2-NEXT: pushq %r13 ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: movswq (%rdi), %rbx -; SSE2-NEXT: movq %rbx, %r10 -; SSE2-NEXT: movq %rbx, %r8 -; SSE2-NEXT: movq %rbx, %r9 -; SSE2-NEXT: movq %rbx, %r11 -; SSE2-NEXT: movq %rbx, %r14 -; SSE2-NEXT: movq %rbx, %r15 -; SSE2-NEXT: movq %rbx, %r12 -; SSE2-NEXT: movq %rbx, %r13 -; SSE2-NEXT: movq %rbx, %rdx -; SSE2-NEXT: movq %rbx, %rsi -; SSE2-NEXT: movq %rbx, %rcx -; SSE2-NEXT: movq %rbx, %rbp -; SSE2-NEXT: movq %rbx, %rax -; SSE2-NEXT: shlq $49, %rax -; SSE2-NEXT: sarq $63, %rax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movq %rbx, %rax -; SSE2-NEXT: shlq $57, %r10 +; SSE2-NEXT: movswq (%rdi), %rax +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movq %rax, %r8 +; SSE2-NEXT: movq %rax, %r9 +; SSE2-NEXT: movq %rax, %r11 +; SSE2-NEXT: movq %rax, %r14 +; SSE2-NEXT: movq %rax, %r15 +; SSE2-NEXT: movq %rax, %r12 +; SSE2-NEXT: movq %rax, %r13 +; SSE2-NEXT: movq %rax, %rdx +; SSE2-NEXT: movq %rax, %rsi +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: movq %rax, %rbp +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: shrq $15, %rbx +; SSE2-NEXT: movd %ebx, %xmm0 +; SSE2-NEXT: movq %rax, %rbx +; SSE2-NEXT: shlq $49, %r10 ; SSE2-NEXT: sarq $63, %r10 ; SSE2-NEXT: movd %r10d, %xmm15 -; SSE2-NEXT: movq %rbx, %r10 -; SSE2-NEXT: movsbq %bl, %rbx +; SSE2-NEXT: movq %rax, %r10 +; SSE2-NEXT: movsbq %al, %rax ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSE2-NEXT: shlq $53, %r8 +; SSE2-NEXT: shlq $50, %r8 ; SSE2-NEXT: sarq $63, %r8 ; SSE2-NEXT: movd %r8d, %xmm8 -; SSE2-NEXT: shlq $61, %r9 +; SSE2-NEXT: shlq $51, %r9 ; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: movd %r9d, %xmm2 -; SSE2-NEXT: shlq $51, %r11 +; SSE2-NEXT: movd %r9d, %xmm3 +; SSE2-NEXT: shlq $52, %r11 ; SSE2-NEXT: sarq $63, %r11 ; SSE2-NEXT: movd %r11d, %xmm9 -; SSE2-NEXT: shlq $59, %r14 +; SSE2-NEXT: shlq $53, %r14 ; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movd %r14d, %xmm5 -; SSE2-NEXT: shlq $55, %r15 +; SSE2-NEXT: movd %r14d, %xmm6 +; SSE2-NEXT: shlq $54, %r15 ; SSE2-NEXT: sarq $63, %r15 ; SSE2-NEXT: movd %r15d, %xmm10 -; SSE2-NEXT: shlq $63, %r12 +; SSE2-NEXT: shlq $55, %r12 ; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movd %r12d, %xmm0 -; 
SSE2-NEXT: shlq $50, %r13 +; SSE2-NEXT: movd %r12d, %xmm2 +; SSE2-NEXT: shlq $60, %r13 ; SSE2-NEXT: sarq $63, %r13 ; SSE2-NEXT: movd %r13d, %xmm11 -; SSE2-NEXT: shlq $58, %rdx +; SSE2-NEXT: shlq $61, %rdx ; SSE2-NEXT: sarq $63, %rdx -; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: shlq $54, %rsi +; SSE2-NEXT: movd %edx, %xmm5 +; SSE2-NEXT: shlq $62, %rsi ; SSE2-NEXT: sarq $63, %rsi ; SSE2-NEXT: movd %esi, %xmm12 -; SSE2-NEXT: shlq $62, %rcx +; SSE2-NEXT: shlq $63, %rcx ; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm6 -; SSE2-NEXT: shlq $52, %rbp +; SSE2-NEXT: movd %ecx, %xmm0 +; SSE2-NEXT: shlq $58, %rbp ; SSE2-NEXT: sarq $63, %rbp ; SSE2-NEXT: movd %ebp, %xmm13 -; SSE2-NEXT: shlq $60, %rax -; SSE2-NEXT: sarq $63, %rax -; SSE2-NEXT: movd %eax, %xmm7 -; SSE2-NEXT: shrq $15, %r10 -; SSE2-NEXT: movd %r10d, %xmm14 -; SSE2-NEXT: shrq $7, %rbx -; SSE2-NEXT: movd %ebx, %xmm3 -; SSE2-NEXT: movswq 2(%rdi), %rdx -; SSE2-NEXT: movq %rdx, %r8 -; SSE2-NEXT: movq %rdx, %r9 -; SSE2-NEXT: movq %rdx, %r10 -; SSE2-NEXT: movq %rdx, %r11 -; SSE2-NEXT: movq %rdx, %r14 -; SSE2-NEXT: movq %rdx, %r15 -; SSE2-NEXT: movq %rdx, %r12 -; SSE2-NEXT: movq %rdx, %r13 -; SSE2-NEXT: movq %rdx, %rbx -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: movq %rdx, %rcx -; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: movq %rdx, %rdi -; SSE2-NEXT: movq %rdx, %rbp -; SSE2-NEXT: shlq $49, %rbp -; SSE2-NEXT: sarq $63, %rbp +; SSE2-NEXT: shlq $59, %rbx +; SSE2-NEXT: sarq $63, %rbx +; SSE2-NEXT: movd %ebx, %xmm7 +; SSE2-NEXT: shlq $57, %r10 +; SSE2-NEXT: sarq $63, %r10 +; SSE2-NEXT: movd %r10d, %xmm4 +; SSE2-NEXT: shrq $7, %rax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: movswq 2(%rdi), %rsi +; SSE2-NEXT: movq %rsi, %r8 +; SSE2-NEXT: movq %rsi, %r9 +; SSE2-NEXT: movq %rsi, %r10 +; SSE2-NEXT: movq %rsi, %r11 +; SSE2-NEXT: movq %rsi, %r14 +; SSE2-NEXT: movq %rsi, %r15 +; SSE2-NEXT: movq %rsi, %r12 +; SSE2-NEXT: movq %rsi, %r13 +; SSE2-NEXT: movq %rsi, %rbx +; SSE2-NEXT: movq %rsi, %rax +; SSE2-NEXT: movq %rsi, %rcx +; SSE2-NEXT: movq %rsi, %rdx +; SSE2-NEXT: movq %rsi, %rdi +; SSE2-NEXT: movq %rsi, %rbp +; SSE2-NEXT: shrq $15, %rbp ; SSE2-NEXT: movd %ebp, %xmm1 -; SSE2-NEXT: movq %rdx, %rbp -; SSE2-NEXT: movsbq %dl, %rdx -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = 
xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: movq %rsi, %rbp +; SSE2-NEXT: movsbq %sil, %rsi +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: shlq $57, %r8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-NEXT: shlq $49, %r8 ; SSE2-NEXT: sarq $63, %r8 -; SSE2-NEXT: movd %r8d, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSE2-NEXT: shlq $53, %r9 +; SSE2-NEXT: movd %r8d, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: shlq $50, %r9 ; SSE2-NEXT: sarq $63, %r9 -; SSE2-NEXT: movd %r9d, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE2-NEXT: shlq $61, %r10 +; SSE2-NEXT: movd %r9d, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE2-NEXT: shlq $51, %r10 ; SSE2-NEXT: sarq $63, %r10 -; SSE2-NEXT: movd %r10d, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSE2-NEXT: shlq $51, %r11 +; SSE2-NEXT: movd %r10d, %xmm5 +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: shlq $52, %r11 ; SSE2-NEXT: sarq $63, %r11 -; SSE2-NEXT: movd %r11d, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: shlq $59, %r14 +; SSE2-NEXT: movd %r11d, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSE2-NEXT: shlq $53, %r14 ; SSE2-NEXT: sarq $63, %r14 -; SSE2-NEXT: movd %r14d, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: shlq $55, %r15 +; SSE2-NEXT: movd %r14d, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE2-NEXT: shlq $54, %r15 ; SSE2-NEXT: sarq $63, %r15 -; SSE2-NEXT: movd %r15d, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE2-NEXT: shlq $63, %r12 +; SSE2-NEXT: movd %r15d, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSE2-NEXT: shlq $55, %r12 ; SSE2-NEXT: sarq $63, %r12 -; SSE2-NEXT: movd %r12d, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE2-NEXT: shlq $50, %r13 +; SSE2-NEXT: movd %r12d, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: shlq $60, %r13 ; SSE2-NEXT: sarq $63, %r13 ; SSE2-NEXT: movd %r13d, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: shlq $58, %rbx +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: shlq $61, %rbx ; SSE2-NEXT: sarq $63, %rbx -; SSE2-NEXT: movd %ebx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSE2-NEXT: shlq $54, %rax +; SSE2-NEXT: movd %ebx, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: shlq $62, %rax ; SSE2-NEXT: sarq $63, %rax -; SSE2-NEXT: movd %eax, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: shlq $62, %rcx +; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: shlq $63, %rcx ; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: shlq $52, %rsi -; SSE2-NEXT: sarq $63, %rsi -; SSE2-NEXT: movd %esi, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSE2-NEXT: punpcklbw 
{{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: shlq $60, %rdi +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: shlq $58, %rdx +; SSE2-NEXT: sarq $63, %rdx +; SSE2-NEXT: movd %edx, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSE2-NEXT: shlq $59, %rdi ; SSE2-NEXT: sarq $63, %rdi -; SSE2-NEXT: movd %edi, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: shrq $15, %rbp +; SSE2-NEXT: movd %edi, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: shlq $57, %rbp +; SSE2-NEXT: sarq $63, %rbp ; SSE2-NEXT: movd %ebp, %xmm2 -; SSE2-NEXT: shrq $7, %rdx -; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE2-NEXT: shrq $7, %rsi +; SSE2-NEXT: movd %esi, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 ; SSE2-NEXT: popq %r13 @@ -3728,162 +3728,162 @@ ; SSSE3-NEXT: pushq %r13 ; SSSE3-NEXT: pushq %r12 ; SSSE3-NEXT: pushq %rbx -; SSSE3-NEXT: movswq (%rdi), %rbx -; SSSE3-NEXT: movq %rbx, %r10 -; SSSE3-NEXT: movq %rbx, %r8 -; SSSE3-NEXT: movq %rbx, %r9 -; SSSE3-NEXT: movq %rbx, %r11 -; SSSE3-NEXT: movq %rbx, %r14 -; SSSE3-NEXT: movq %rbx, %r15 -; SSSE3-NEXT: movq %rbx, %r12 -; SSSE3-NEXT: movq %rbx, %r13 -; SSSE3-NEXT: movq %rbx, %rdx -; SSSE3-NEXT: movq %rbx, %rsi -; SSSE3-NEXT: movq %rbx, %rcx -; SSSE3-NEXT: movq %rbx, %rbp -; SSSE3-NEXT: movq %rbx, %rax -; SSSE3-NEXT: shlq $49, %rax -; SSSE3-NEXT: sarq $63, %rax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movq %rbx, %rax -; SSSE3-NEXT: shlq $57, %r10 +; SSSE3-NEXT: movswq (%rdi), %rax +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movq %rax, %r8 +; SSSE3-NEXT: movq %rax, %r9 +; SSSE3-NEXT: movq %rax, %r11 +; SSSE3-NEXT: movq %rax, %r14 +; SSSE3-NEXT: movq %rax, %r15 +; SSSE3-NEXT: movq %rax, %r12 +; SSSE3-NEXT: movq %rax, %r13 +; SSSE3-NEXT: 
movq %rax, %rdx +; SSSE3-NEXT: movq %rax, %rsi +; SSSE3-NEXT: movq %rax, %rcx +; SSSE3-NEXT: movq %rax, %rbp +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: shrq $15, %rbx +; SSSE3-NEXT: movd %ebx, %xmm0 +; SSSE3-NEXT: movq %rax, %rbx +; SSSE3-NEXT: shlq $49, %r10 ; SSSE3-NEXT: sarq $63, %r10 ; SSSE3-NEXT: movd %r10d, %xmm15 -; SSSE3-NEXT: movq %rbx, %r10 -; SSSE3-NEXT: movsbq %bl, %rbx +; SSSE3-NEXT: movq %rax, %r10 +; SSSE3-NEXT: movsbq %al, %rax ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] -; SSSE3-NEXT: shlq $53, %r8 +; SSSE3-NEXT: shlq $50, %r8 ; SSSE3-NEXT: sarq $63, %r8 ; SSSE3-NEXT: movd %r8d, %xmm8 -; SSSE3-NEXT: shlq $61, %r9 +; SSSE3-NEXT: shlq $51, %r9 ; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: movd %r9d, %xmm2 -; SSSE3-NEXT: shlq $51, %r11 +; SSSE3-NEXT: movd %r9d, %xmm3 +; SSSE3-NEXT: shlq $52, %r11 ; SSSE3-NEXT: sarq $63, %r11 ; SSSE3-NEXT: movd %r11d, %xmm9 -; SSSE3-NEXT: shlq $59, %r14 +; SSSE3-NEXT: shlq $53, %r14 ; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movd %r14d, %xmm5 -; SSSE3-NEXT: shlq $55, %r15 +; SSSE3-NEXT: movd %r14d, %xmm6 +; SSSE3-NEXT: shlq $54, %r15 ; SSSE3-NEXT: sarq $63, %r15 ; SSSE3-NEXT: movd %r15d, %xmm10 -; SSSE3-NEXT: shlq $63, %r12 +; SSSE3-NEXT: shlq $55, %r12 ; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movd %r12d, %xmm0 -; SSSE3-NEXT: shlq $50, %r13 +; SSSE3-NEXT: movd %r12d, %xmm2 +; SSSE3-NEXT: shlq $60, %r13 ; SSSE3-NEXT: sarq $63, %r13 ; SSSE3-NEXT: movd %r13d, %xmm11 -; SSSE3-NEXT: shlq $58, %rdx +; SSSE3-NEXT: shlq $61, %rdx ; SSSE3-NEXT: sarq $63, %rdx -; SSSE3-NEXT: movd %edx, %xmm4 -; SSSE3-NEXT: shlq $54, %rsi +; SSSE3-NEXT: movd %edx, %xmm5 +; SSSE3-NEXT: shlq $62, %rsi ; SSSE3-NEXT: sarq $63, %rsi ; SSSE3-NEXT: movd %esi, %xmm12 -; SSSE3-NEXT: shlq $62, %rcx +; SSSE3-NEXT: shlq $63, %rcx ; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm6 -; SSSE3-NEXT: shlq $52, %rbp +; SSSE3-NEXT: movd %ecx, %xmm0 +; SSSE3-NEXT: shlq $58, %rbp ; SSSE3-NEXT: sarq $63, %rbp ; SSSE3-NEXT: movd %ebp, %xmm13 -; SSSE3-NEXT: shlq $60, %rax -; SSSE3-NEXT: sarq $63, %rax -; SSSE3-NEXT: movd %eax, %xmm7 -; SSSE3-NEXT: shrq $15, %r10 -; SSSE3-NEXT: movd %r10d, %xmm14 -; SSSE3-NEXT: shrq $7, %rbx -; SSSE3-NEXT: movd %ebx, %xmm3 -; SSSE3-NEXT: movswq 2(%rdi), %rdx -; SSSE3-NEXT: movq %rdx, %r8 -; SSSE3-NEXT: movq %rdx, %r9 -; SSSE3-NEXT: movq %rdx, %r10 -; SSSE3-NEXT: movq %rdx, %r11 -; SSSE3-NEXT: movq %rdx, %r14 -; SSSE3-NEXT: movq %rdx, %r15 -; SSSE3-NEXT: movq %rdx, %r12 -; SSSE3-NEXT: movq %rdx, %r13 -; SSSE3-NEXT: movq %rdx, %rbx -; SSSE3-NEXT: movq %rdx, %rax -; SSSE3-NEXT: movq %rdx, %rcx -; SSSE3-NEXT: movq %rdx, %rsi -; SSSE3-NEXT: movq %rdx, %rdi -; SSSE3-NEXT: movq %rdx, %rbp -; SSSE3-NEXT: shlq $49, %rbp -; SSSE3-NEXT: sarq $63, %rbp +; SSSE3-NEXT: shlq $59, %rbx +; SSSE3-NEXT: sarq $63, %rbx +; SSSE3-NEXT: movd %ebx, %xmm7 +; SSSE3-NEXT: shlq $57, %r10 +; SSSE3-NEXT: sarq $63, %r10 +; SSSE3-NEXT: movd %r10d, %xmm4 +; SSSE3-NEXT: shrq $7, %rax +; SSSE3-NEXT: movd %eax, %xmm14 +; SSSE3-NEXT: movswq 2(%rdi), %rsi +; SSSE3-NEXT: movq %rsi, %r8 +; SSSE3-NEXT: movq %rsi, %r9 +; SSSE3-NEXT: movq %rsi, %r10 +; SSSE3-NEXT: movq %rsi, %r11 +; SSSE3-NEXT: movq %rsi, %r14 +; SSSE3-NEXT: movq %rsi, %r15 +; SSSE3-NEXT: movq %rsi, %r12 +; SSSE3-NEXT: movq %rsi, %r13 +; SSSE3-NEXT: movq %rsi, %rbx +; SSSE3-NEXT: movq %rsi, %rax +; SSSE3-NEXT: movq %rsi, %rcx +; SSSE3-NEXT: movq %rsi, %rdx +; SSSE3-NEXT: movq %rsi, %rdi +; 
SSSE3-NEXT: movq %rsi, %rbp +; SSSE3-NEXT: shrq $15, %rbp ; SSSE3-NEXT: movd %ebp, %xmm1 -; SSSE3-NEXT: movq %rdx, %rbp -; SSSE3-NEXT: movsbq %dl, %rdx -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3],xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSSE3-NEXT: movq %rsi, %rbp +; SSSE3-NEXT: movsbq %sil, %rsi +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1],xmm2[2],xmm10[2],xmm2[3],xmm10[3],xmm2[4],xmm10[4],xmm2[5],xmm10[5],xmm2[6],xmm10[6],xmm2[7],xmm10[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3],xmm5[4],xmm11[4],xmm5[5],xmm11[5],xmm5[6],xmm11[6],xmm5[7],xmm11[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSSE3-NEXT: shlq $57, %r8 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = 
xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSSE3-NEXT: shlq $49, %r8 ; SSSE3-NEXT: sarq $63, %r8 -; SSSE3-NEXT: movd %r8d, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] -; SSSE3-NEXT: shlq $53, %r9 +; SSSE3-NEXT: movd %r8d, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSSE3-NEXT: shlq $50, %r9 ; SSSE3-NEXT: sarq $63, %r9 -; SSSE3-NEXT: movd %r9d, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSSE3-NEXT: shlq $61, %r10 +; SSSE3-NEXT: movd %r9d, %xmm4 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSSE3-NEXT: shlq $51, %r10 ; SSSE3-NEXT: sarq $63, %r10 -; SSSE3-NEXT: movd %r10d, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] -; SSSE3-NEXT: shlq $51, %r11 +; SSSE3-NEXT: movd %r10d, %xmm5 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSSE3-NEXT: shlq $52, %r11 ; SSSE3-NEXT: sarq $63, %r11 -; SSSE3-NEXT: movd %r11d, %xmm5 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: shlq $59, %r14 +; SSSE3-NEXT: movd %r11d, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSSE3-NEXT: shlq $53, %r14 ; SSSE3-NEXT: sarq $63, %r14 -; SSSE3-NEXT: movd %r14d, %xmm6 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSSE3-NEXT: shlq $55, %r15 +; SSSE3-NEXT: movd %r14d, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSSE3-NEXT: shlq $54, %r15 ; SSSE3-NEXT: sarq $63, %r15 -; SSSE3-NEXT: movd %r15d, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSSE3-NEXT: shlq $63, %r12 +; SSSE3-NEXT: movd %r15d, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] +; SSSE3-NEXT: shlq $55, %r12 ; SSSE3-NEXT: sarq $63, %r12 -; SSSE3-NEXT: movd %r12d, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSSE3-NEXT: shlq $50, %r13 +; SSSE3-NEXT: movd %r12d, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSSE3-NEXT: shlq $60, %r13 ; SSSE3-NEXT: sarq $63, %r13 ; SSSE3-NEXT: movd %r13d, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSSE3-NEXT: shlq $58, %rbx +; SSSE3-NEXT: 
punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSSE3-NEXT: shlq $61, %rbx ; SSSE3-NEXT: sarq $63, %rbx -; SSSE3-NEXT: movd %ebx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] -; SSSE3-NEXT: shlq $54, %rax +; SSSE3-NEXT: movd %ebx, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSSE3-NEXT: shlq $62, %rax ; SSSE3-NEXT: sarq $63, %rax -; SSSE3-NEXT: movd %eax, %xmm5 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSSE3-NEXT: shlq $62, %rcx +; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSSE3-NEXT: shlq $63, %rcx ; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: shlq $52, %rsi -; SSSE3-NEXT: sarq $63, %rsi -; SSSE3-NEXT: movd %esi, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSSE3-NEXT: shlq $60, %rdi +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSSE3-NEXT: shlq $58, %rdx +; SSSE3-NEXT: sarq $63, %rdx +; SSSE3-NEXT: movd %edx, %xmm2 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] +; SSSE3-NEXT: shlq $59, %rdi ; SSSE3-NEXT: sarq $63, %rdi -; SSSE3-NEXT: movd %edi, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: shrq $15, %rbp +; SSSE3-NEXT: movd %edi, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSSE3-NEXT: shlq $57, %rbp +; SSSE3-NEXT: sarq $63, %rbp ; SSSE3-NEXT: movd %ebp, %xmm2 -; SSSE3-NEXT: shrq $7, %rdx -; SSSE3-NEXT: movd %edx, %xmm5 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSSE3-NEXT: shrq $7, %rsi +; SSSE3-NEXT: movd %esi, %xmm5 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; SSSE3-NEXT: popq %rbx ; SSSE3-NEXT: popq %r12 ; SSSE3-NEXT: popq %r13 Index: test/CodeGen/X86/vector-shuffle-variable-128.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-variable-128.ll +++ test/CodeGen/X86/vector-shuffle-variable-128.ll @@ -83,7 +83,7 @@ ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: @@ -103,7 +103,7 @@ ; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32: @@ -168,7 +168,7 @@ ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: @@ -188,7 +188,7 @@ ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32: @@ -257,27 +257,27 @@ ; SSE2-NEXT: andl $7, %eax ; SSE2-NEXT: movzwl -24(%rsp,%rax,2), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %eax +; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax +; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax ; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movzwl -24(%rsp,%r10,2), %eax +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: movzwl -24(%rsp,%rdx,2), %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: @@ -301,27 +301,27 @@ ; SSSE3-NEXT: andl $7, %eax ; SSSE3-NEXT: movzwl -24(%rsp,%rax,2), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %eax +; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax ; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax ; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax +; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax ; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: movzwl -24(%rsp,%r10,2), %eax +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: movzwl -24(%rsp,%rdx,2), %eax ; SSSE3-NEXT: movd %eax, %xmm1 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: movzwl -24(%rsp,%r8,2), %eax +; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: movzwl -24(%rsp,%rdi,2), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16: @@ -425,67 +425,67 @@ ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r10), %eax ; SSE2-NEXT: movd %eax, %xmm9 -; SSE2-NEXT: andl $15, %ecx -; SSE2-NEXT: movzbl (%rcx,%r10), %eax +; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: andl $15, %eax +; SSE2-NEXT: movzbl (%rax,%r10), %eax ; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r10), %eax ; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: andl $15, %r9d -; SSE2-NEXT: movzbl (%r9,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm7 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm11 -; SSE2-NEXT: andl $15, %esi -; SSE2-NEXT: movzbl (%rsi,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm6 +; SSE2-NEXT: movd %eax, %xmm7 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r10), %eax 
-; SSE2-NEXT: movd %eax, %xmm12 +; SSE2-NEXT: movd %eax, %xmm11 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: movd %eax, %xmm6 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm13 -; SSE2-NEXT: andl $15, %edx -; SSE2-NEXT: movzbl (%rdx,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movd %eax, %xmm12 ; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: andl $15, %eax ; SSE2-NEXT: movzbl (%rax,%r10), %eax -; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: movd %eax, %xmm5 +; SSE2-NEXT: andl $15, %r9d +; SSE2-NEXT: movzbl (%r9,%r10), %eax +; SSE2-NEXT: movd %eax, %xmm13 ; SSE2-NEXT: andl $15, %r8d ; SSE2-NEXT: movzbl (%r8,%r10), %eax +; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: andl $15, %ecx +; SSE2-NEXT: movzbl (%rcx,%r10), %eax +; SSE2-NEXT: movd %eax, %xmm14 +; SSE2-NEXT: andl $15, %edx +; SSE2-NEXT: movzbl (%rdx,%r10), %eax ; SSE2-NEXT: movd %eax, %xmm1 -; SSE2-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movzbl (%rax,%r10), %eax +; SSE2-NEXT: andl $15, %esi +; SSE2-NEXT: movzbl (%rsi,%r10), %eax ; SSE2-NEXT: movd %eax, %xmm2 ; SSE2-NEXT: andl $15, %edi ; SSE2-NEXT: movzbl (%rdi,%r10), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -510,67 +510,67 @@ ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r10), %eax ; SSSE3-NEXT: movd %eax, %xmm9 -; SSSE3-NEXT: andl $15, %ecx -; SSSE3-NEXT: movzbl (%rcx,%r10), %eax +; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax +; SSSE3-NEXT: andl $15, %eax +; SSSE3-NEXT: movzbl (%rax,%r10), %eax ; SSSE3-NEXT: movd %eax, %xmm3 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r10), %eax ; SSSE3-NEXT: movd %eax, %xmm10 -; SSSE3-NEXT: andl $15, %r9d -; SSSE3-NEXT: movzbl (%r9,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm7 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm11 -; SSSE3-NEXT: andl $15, %esi -; SSSE3-NEXT: movzbl (%rsi,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm6 +; SSSE3-NEXT: movd %eax, %xmm7 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm12 +; SSSE3-NEXT: movd %eax, %xmm11 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: movd %eax, %xmm6 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm13 -; SSSE3-NEXT: andl $15, %edx -; SSSE3-NEXT: movzbl (%rdx,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: movd %eax, %xmm12 ; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax ; SSSE3-NEXT: andl $15, %eax ; SSSE3-NEXT: movzbl (%rax,%r10), %eax -; SSSE3-NEXT: movd %eax, %xmm14 +; SSSE3-NEXT: movd %eax, %xmm5 +; SSSE3-NEXT: andl $15, %r9d +; SSSE3-NEXT: movzbl (%r9,%r10), %eax +; SSSE3-NEXT: movd %eax, %xmm13 ; SSSE3-NEXT: andl $15, %r8d ; SSSE3-NEXT: movzbl (%r8,%r10), %eax +; SSSE3-NEXT: movd %eax, %xmm4 +; SSSE3-NEXT: andl $15, %ecx +; SSSE3-NEXT: movzbl (%rcx,%r10), %eax +; SSSE3-NEXT: movd %eax, %xmm14 +; SSSE3-NEXT: andl $15, %edx +; SSSE3-NEXT: movzbl (%rdx,%r10), %eax ; SSSE3-NEXT: movd %eax, %xmm1 -; SSSE3-NEXT: movzbl {{[0-9]+}}(%rsp), %eax -; SSSE3-NEXT: andl $15, %eax -; SSSE3-NEXT: movzbl (%rax,%r10), %eax +; SSSE3-NEXT: andl $15, %esi +; SSSE3-NEXT: movzbl (%rsi,%r10), %eax ; SSSE3-NEXT: movd %eax, %xmm2 ; SSSE3-NEXT: andl $15, %edi ; SSSE3-NEXT: movzbl (%rdi,%r10), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = 
xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -739,7 +739,7 @@ ; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: punpcklqdq 
{{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: @@ -759,7 +759,7 @@ ; SSSE3-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32: @@ -824,23 +824,23 @@ ; SSE2-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm8 -; SSE2-NEXT: movzbl 7(%rdi), %edx +; SSE2-NEXT: movzbl 14(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm15 -; SSE2-NEXT: movzbl 11(%rdi), %edx +; SSE2-NEXT: movzbl 13(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm9 -; SSE2-NEXT: movzbl 3(%rdi), %edx +; SSE2-NEXT: movzbl 12(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm3 -; SSE2-NEXT: movzbl 13(%rdi), %edx +; SSE2-NEXT: movzbl 11(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm10 -; SSE2-NEXT: movzbl 5(%rdi), %edx +; SSE2-NEXT: movzbl 10(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm7 @@ -848,11 +848,11 @@ ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm11 -; SSE2-NEXT: movzbl 1(%rdi), %edx +; SSE2-NEXT: movzbl 8(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm6 -; SSE2-NEXT: movzbl 14(%rdi), %edx +; SSE2-NEXT: movzbl 7(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm12 @@ -860,23 +860,23 @@ ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm5 -; SSE2-NEXT: movzbl 10(%rdi), %edx +; SSE2-NEXT: movzbl 5(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm13 -; SSE2-NEXT: movzbl 2(%rdi), %edx +; SSE2-NEXT: movzbl 4(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm4 -; SSE2-NEXT: movzbl 12(%rdi), %edx +; SSE2-NEXT: movzbl 3(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm14 -; SSE2-NEXT: movzbl 4(%rdi), %edx +; SSE2-NEXT: movzbl 2(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: movzbl 8(%rdi), %edx +; SSE2-NEXT: movzbl 1(%rdi), %edx ; SSE2-NEXT: andl $15, %edx ; SSE2-NEXT: movzbl (%rdx,%rcx), %edx ; SSE2-NEXT: movd %edx, %xmm2 @@ -885,19 +885,19 @@ ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8: @@ -909,23 +909,23 @@ ; SSSE3-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx ; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx ; SSSE3-NEXT: movd %edx, %xmm8 -; SSSE3-NEXT: movzbl 7(%rdi), %edx +; SSSE3-NEXT: movzbl 14(%rdi), %edx ; SSSE3-NEXT: andl $15, %edx ; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx ; SSSE3-NEXT: movd %edx, %xmm15 -; SSSE3-NEXT: movzbl 11(%rdi), %edx +; SSSE3-NEXT: movzbl 13(%rdi), %edx ; SSSE3-NEXT: andl $15, %edx ; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx ; SSSE3-NEXT: movd %edx, %xmm9 -; SSSE3-NEXT: movzbl 3(%rdi), %edx +; SSSE3-NEXT: movzbl 12(%rdi), %edx ; SSSE3-NEXT: andl $15, %edx ; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx ; SSSE3-NEXT: movd %edx, %xmm3 -; SSSE3-NEXT: movzbl 13(%rdi), %edx +; SSSE3-NEXT: movzbl 11(%rdi), %edx ; SSSE3-NEXT: andl $15, %edx ; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx ; SSSE3-NEXT: movd %edx, %xmm10 -; SSSE3-NEXT: 
movzbl 5(%rdi), %edx
+; SSSE3-NEXT: movzbl 10(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm7
@@ -933,11 +933,11 @@
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm11
-; SSSE3-NEXT: movzbl 1(%rdi), %edx
+; SSSE3-NEXT: movzbl 8(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm6
-; SSSE3-NEXT: movzbl 14(%rdi), %edx
+; SSSE3-NEXT: movzbl 7(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm12
@@ -945,23 +945,23 @@
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm5
-; SSSE3-NEXT: movzbl 10(%rdi), %edx
+; SSSE3-NEXT: movzbl 5(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm13
-; SSSE3-NEXT: movzbl 2(%rdi), %edx
+; SSSE3-NEXT: movzbl 4(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm4
-; SSSE3-NEXT: movzbl 12(%rdi), %edx
+; SSSE3-NEXT: movzbl 3(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm14
-; SSSE3-NEXT: movzbl 4(%rdi), %edx
+; SSSE3-NEXT: movzbl 2(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: movzbl 8(%rdi), %edx
+; SSSE3-NEXT: movzbl 1(%rdi), %edx
; SSSE3-NEXT: andl $15, %edx
; SSSE3-NEXT: movzbl (%rdx,%rcx), %edx
; SSSE3-NEXT: movd %edx, %xmm2
@@ -970,19 +970,19 @@
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -1225,28 +1225,27 @@
; SSE2-NEXT: andl $7, %ecx
; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: andl $7, %r8d
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT: andl $7, %r9d
; SSE2-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
-; SSE2-NEXT: movd %eax, %xmm3
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSE2-NEXT: movzwl -40(%rsp,%rdx,2), %eax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: movzwl -40(%rsp,%r8,2), %eax
; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSE2-NEXT: movd %eax, %xmm2
; SSE2-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movzwl -24(%rsp,%r9,2), %eax
+; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movzwl -40(%rsp,%r8,2), %eax
+; SSE2-NEXT: movd %eax, %xmm2
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
@@ -1263,28 +1262,27 @@
; SSSE3-NEXT: andl $7, %ecx
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: andl $7, %r8d
-; SSSE3-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT: andl $7, %r9d
; SSSE3-NEXT: movzwl -24(%rsp,%rcx,2), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm3
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
; SSSE3-NEXT: movzwl -40(%rsp,%rdx,2), %eax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: movzwl -40(%rsp,%r8,2), %eax
; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT: movzwl -24(%rsp,%rsi,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
; SSSE3-NEXT: movzwl -40(%rsp,%rdi,2), %eax
; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-NEXT: movzwl -24(%rsp,%r9,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm1
+; SSSE3-NEXT: movzwl -40(%rsp,%r8,2), %eax
+; SSSE3-NEXT: movd %eax, %xmm2
+; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT: pxor %xmm1, %xmm1
+; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
Index: test/CodeGen/X86/vshift-1.ll
===================================================================
--- test/CodeGen/X86/vshift-1.ll
+++ test/CodeGen/X86/vshift-1.ll
@@ -28,12 +28,9 @@
; X32-LABEL: shift1b:
; X32: # BB#0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-NEXT: psllq %xmm2, %xmm0
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-NEXT: psllq %xmm1, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
Index: test/CodeGen/X86/vshift-2.ll
===================================================================
--- test/CodeGen/X86/vshift-2.ll
+++ test/CodeGen/X86/vshift-2.ll
@@ -28,12 +28,9 @@
; X32-LABEL: shift1b:
; X32: # BB#0: # %entry
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
-; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-NEXT: psrlq %xmm2, %xmm0
+; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; X32-NEXT: psrlq %xmm1, %xmm0
; X32-NEXT: movdqa %xmm0, (%eax)
; X32-NEXT: retl
;
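
A note for reviewers on the shape of the updated CHECK lines (this commentary and the sketch below are not part of the patch). The expected codegen now combines partial vectors pairwise with the unpack width doubling every round, which is why the v8i16 and v16i8 expectations above end in punpcklwd, then punpckldq, then punpcklqdq rather than a chain of element-width unpacks. A minimal host-side C++ sketch of that merge order, with all names (Part, unpackLo, buildVector) purely illustrative:

// Sketch only: each Part models the elements that are valid in the low
// lanes of one XMM register; unpackLo() stands in for the punpckl*
// instruction at the width of its operands' valid prefix.
#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

using Part = std::vector<uint16_t>;

// Concatenating the valid low elements of A and B is exactly what a
// low-unpack of the right width does to the low half of the result.
static Part unpackLo(const Part &A, const Part &B) {
  Part R(A);
  R.insert(R.end(), B.begin(), B.end());
  return R;
}

// Merge a power-of-two number of single-element parts in log2(N)
// rounds: for v8i16 that is 4 word unpacks, then 2 dword unpacks,
// then 1 qword unpack (7 shuffles total).
static Part buildVector(std::vector<Part> Parts) {
  while (Parts.size() > 1) {
    std::vector<Part> Next;
    for (size_t I = 0; I + 1 < Parts.size(); I += 2)
      Next.push_back(unpackLo(Parts[I], Parts[I + 1]));
    Parts = std::move(Next);
  }
  return Parts.front();
}

int main() {
  std::vector<Part> Scalars;
  for (uint16_t V = 0; V != 8; ++V)
    Scalars.push_back({V});
  Part Result = buildVector(Scalars);
  assert(Result.size() == 8);
  for (uint16_t V = 0; V != 8; ++V)
    assert(Result[V] == V); // elements land in source order
}

The vshift-1.ll/vshift-2.ll hunks show a knock-on benefit of the same structure: once the final merge is a qword-wide concatenation, the two movd loads plus pshufd/punpckldq that previously assembled the 64-bit shift amount can fold into a single movq load.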