Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -4460,6 +4460,29 @@
   SDLoc dl(Op);
   SDValue V;
   bool First = true;
+
+  // SSE4.1 - use PINSRB to insert each byte directly.
+  if (Subtarget->hasSSE41()) {
+    for (unsigned i = 0; i < 16; ++i) {
+      bool isNonZero = (NonZeros & (1 << i)) != 0;
+      if (isNonZero) {
+        if (First) {
+          if (NumZero)
+            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+          else
+            V = DAG.getUNDEF(MVT::v16i8);
+          First = false;
+        }
+        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+                        MVT::v16i8, V, Op.getOperand(i),
+                        DAG.getIntPtrConstant(i));
+      }
+    }
+
+    return V;
+  }
+
+  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
   for (unsigned i = 0; i < 16; ++i) {
     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
     if (ThisIsNonZero && First) {
Index: llvm/trunk/test/CodeGen/X86/vec_cast2.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vec_cast2.ll
+++ llvm/trunk/test/CodeGen/X86/vec_cast2.ll
@@ -100,37 +100,29 @@
 ;
 ; CHECK-WIDE-LABEL: foo3_8:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
-; CHECK-WIDE-NEXT:    movzbl %dl, %edx
-; CHECK-WIDE-NEXT:    orl %eax, %edx
-; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm2 ## xmm2 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpinsrw $2, %ecx, %xmm1, %xmm1
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm0
+; CHECK-WIDE-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    vzeroupper
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <8 x float> %src to <8 x i8>
@@ -145,21 +137,17 @@
 ;
 ; CHECK-WIDE-LABEL: foo3_4:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %ecx
-; CHECK-WIDE-NEXT:    movzbl %cl, %ecx
-; CHECK-WIDE-NEXT:    orl %eax, %ecx
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
-; CHECK-WIDE-NEXT:    shll $8, %eax
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %edx
-; CHECK-WIDE-NEXT:    movzbl %dl, %edx
-; CHECK-WIDE-NEXT:    orl %eax, %edx
-; CHECK-WIDE-NEXT:    vpinsrw $0, %edx, %xmm0, %xmm0
-; CHECK-WIDE-NEXT:    vpinsrw $1, %ecx, %xmm0, %xmm0
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
+; CHECK-WIDE-NEXT:    vmovshdup %xmm0, %xmm2 ## xmm2 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilpd $1, %xmm0, %xmm2 ## xmm2 = xmm0[1,0]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; CHECK-WIDE-NEXT:    vpermilps $231, %xmm0, %xmm0 ## xmm0 = xmm0[3,1,2,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
+; CHECK-WIDE-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm0
 ; CHECK-WIDE-NEXT:    retl
   %res = fptosi <4 x float> %src to <4 x i8>
   ret <4 x i8> %res
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -651,18 +651,30 @@
 }
 
 define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE:       # BB#0:
-; SSE-NEXT:    shll $8, %edi
-; SSE-NEXT:    pxor %xmm0, %xmm0
-; SSE-NEXT:    pinsrw $2, %edi, %xmm0
-; SSE-NEXT:    retq
-
+; SSE2-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shll $8, %edi
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $2, %edi, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shll $8, %edi
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    pinsrw $2, %edi, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $5, %edi, %xmm0
+; SSE41-NEXT:    retq
+;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    shll $8, %edi
 ; AVX-NEXT:    vpxor %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $2, %edi, %xmm0
+; AVX-NEXT:    vpinsrb $5, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
@@ -670,18 +682,30 @@
 }
 
 define <16 x i8> @shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
-; SSE:       # BB#0:
-; SSE-NEXT:    shll $8, %edi
-; SSE-NEXT:    pxor %xmm0, %xmm0
-; SSE-NEXT:    pinsrw $7, %edi, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shll $8, %edi
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $7, %edi, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    shll $8, %edi
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    pinsrw $7, %edi, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $15, %edi, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_uu_uu_zz_uu_uu_zz_zz_zz_zz_zz_zz_zz_zz_zz_16:
 ; AVX:       # BB#0:
-; AVX-NEXT:    shll $8, %edi
 ; AVX-NEXT:    vpxor %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $7, %edi, %xmm0
+; AVX-NEXT:    vpinsrb $15, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 0
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
@@ -689,18 +713,30 @@
 }
 
 define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
-; SSE-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE:       # BB#0:
-; SSE-NEXT:    movzbl %dil, %eax
-; SSE-NEXT:    pxor %xmm0, %xmm0
-; SSE-NEXT:    pinsrw $1, %eax, %xmm0
-; SSE-NEXT:    retq
+; SSE2-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movzbl %dil, %eax
+; SSSE3-NEXT:    pxor %xmm0, %xmm0
+; SSSE3-NEXT:    pinsrw $1, %eax, %xmm0
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
+; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    movzbl %dil, %eax
 ; AVX-NEXT:    vpxor %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $1, %eax, %xmm0
+; AVX-NEXT:    vpinsrb $2, %edi, %xmm0
 ; AVX-NEXT:    retq
   %a = insertelement <16 x i8> undef, i8 %i, i32 3
   %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32>
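
Note: the following is a minimal standalone sketch in plain C++ (not LLVM code; the helper names buildWithByteInserts/buildWithWordInserts and the NonZeros mask layout are illustrative assumptions) of the difference between the two lowering paths the patch separates. With SSE4.1 each non-zero byte of a v16i8 build_vector can be inserted directly (one PINSRB per set bit in the mask), while the pre-SSE4.1 path merges each byte pair into a 16-bit word in a scalar register (movzbl/shll/orl) before inserting it with PINSRW. The sketch models both strategies on a zero base vector and checks they build the same value.

// Standalone illustration (hypothetical names, not LLVM code): models the
// per-byte (PINSRB-style) path and the byte-pair (PINSRW-style) path for a
// v16i8 build_vector whose non-zero lanes are described by a 16-bit mask.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

using V16i8 = std::array<uint8_t, 16>;

// SSE4.1-style path: start from a zero vector and insert each
// non-zero byte directly (one insert per set bit in NonZeros).
static V16i8 buildWithByteInserts(const V16i8 &Elts, uint16_t NonZeros) {
  V16i8 V{}; // zero vector
  for (unsigned i = 0; i < 16; ++i)
    if (NonZeros & (1u << i))
      V[i] = Elts[i]; // models PINSRB $i
  return V;
}

// Pre-SSE4.1-style path: merge each pair of bytes into a 16-bit word
// and insert the whole word (models scalar merge + PINSRW).
static V16i8 buildWithWordInserts(const V16i8 &Elts, uint16_t NonZeros) {
  V16i8 V{};
  for (unsigned i = 0; i < 16; i += 2) {
    bool LoNonZero = (NonZeros & (1u << i)) != 0;
    bool HiNonZero = (NonZeros & (1u << (i + 1))) != 0;
    if (!LoNonZero && !HiNonZero)
      continue; // no insert needed for an all-zero pair
    uint16_t Word = (uint16_t)((LoNonZero ? Elts[i] : 0) |
                               ((HiNonZero ? Elts[i + 1] : 0) << 8));
    V[i] = Word & 0xff;   // PINSRW overwrites the full 16-bit lane:
    V[i + 1] = Word >> 8; // low and high byte of word i/2
  }
  return V;
}

int main() {
  V16i8 Elts{};
  Elts[5] = 0xAB; // single non-zero byte, as in the shuffle tests above
  uint16_t NonZeros = 1u << 5;
  assert(buildWithByteInserts(Elts, NonZeros) ==
         buildWithWordInserts(Elts, NonZeros));
  std::puts("both lowering models agree");
  return 0;
}

For the single-non-zero-byte cases in the shuffle tests above, the byte-insert model issues one insert and no scalar merging, which is why the SSE41 and AVX check lines lose the shll/movzbl instructions and switch from pinsrw to pinsrb.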