This patch transforms truncations from v8i32/v16i32 to v8i8/v16i8 into bit-and and X86ISD::PACKUS operations during DAG combine. We don't do this in the lowering phase because, after type legalization, the original truncation has been turned into a BUILD_VECTOR whose elements are each extracted from a vector and truncated individually, and it is difficult to recover this optimization from that form. This greatly improves the performance of these two truncations (a sketch of the combine appears at the end of this message). For example, for the following IR:

define void @truncate_v16i32_to_v16i8(<16 x i32> %a) {
  %1 = trunc <16 x i32> %a to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}

On SSE2 this was previously compiled into 33 instructions:

movdqa %xmm3, -24(%rsp)
movdqa %xmm1, -56(%rsp)
movdqa %xmm2, -40(%rsp)
movdqa %xmm0, -72(%rsp)
punpcklbw %xmm3, %xmm1
punpcklbw %xmm2, %xmm0
punpcklbw %xmm1, %xmm0
movd -20(%rsp), %xmm1
movd -52(%rsp), %xmm2
movd -16(%rsp), %xmm3
movd -48(%rsp), %xmm4
punpcklbw %xmm3, %xmm4
movd -36(%rsp), %xmm3
movd -68(%rsp), %xmm5
movd -32(%rsp), %xmm6
movd -64(%rsp), %xmm7
punpcklbw %xmm6, %xmm7
punpcklbw %xmm4, %xmm7
punpcklbw %xmm7, %xmm0
punpcklbw %xmm1, %xmm2
punpcklbw %xmm3, %xmm5
punpcklbw %xmm2, %xmm5
movd -12(%rsp), %xmm1
movd -44(%rsp), %xmm2
punpcklbw %xmm1, %xmm2
movd -28(%rsp), %xmm1
movd -60(%rsp), %xmm3
punpcklbw %xmm1, %xmm3
punpcklbw %xmm2, %xmm3
punpcklbw %xmm3, %xmm5
punpcklbw %xmm5, %xmm0
movdqu %xmm0, (%rax)
retq

and now it is compiled into 10 instructions:

movdqa LCPI0_0(%rip), %xmm4
pand %xmm4, %xmm3
pand %xmm4, %xmm2
packuswb %xmm3, %xmm2
pand %xmm4, %xmm1
pand %xmm4, %xmm0
packuswb %xmm1, %xmm0
packuswb %xmm2, %xmm0
movdqu %xmm0, (%rax)
retq
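
For reference, the new sequence corresponds to the following SSE2 intrinsics. This is a hand-written illustration of the pattern, not output of the patch; the function signature and names are invented for the example:

#include <emmintrin.h> // SSE2

// Illustration only: truncate sixteen i32 lanes (four XMM registers)
// to sixteen bytes, mirroring the pand/packuswb sequence above.
void truncate_v16i32_to_v16i8(__m128i a0, __m128i a1, __m128i a2,
                              __m128i a3, unsigned char *dst) {
  // The 0x000000FF splat (LCPI0_0 above): clamp every 32-bit lane to
  // 0..255 so the saturating packs cannot change the values.
  const __m128i mask = _mm_set1_epi32(0xFF);
  // First level of packuswb: each masked v4i32, viewed as eight words,
  // packs pairwise to bytes; read back as words, lo/hi each hold eight
  // of the original values.
  __m128i lo = _mm_packus_epi16(_mm_and_si128(a0, mask),
                                _mm_and_si128(a1, mask));
  __m128i hi = _mm_packus_epi16(_mm_and_si128(a2, mask),
                                _mm_and_si128(a3, mask));
  // Second level merges the two halves into the final 16 bytes.
  _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
}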

The new sequence saves 23 instructions, many of them memops. Note that the pand with the 0x000000FF splat is what makes the packs safe: it clamps every 32-bit lane to the range 0..255, so the unsigned-saturating packuswb instructions pass the low bytes through unchanged.
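
For completeness, here is a minimal sketch of what the combine looks like in SelectionDAG terms, for the v16i32 case only, assuming an SSE2 target and using an invented helper name (the committed code is more general and also handles v8i32):

#include "llvm/CodeGen/SelectionDAG.h"
#include "X86ISelLowering.h" // target-private header defining X86ISD::PACKUS
using namespace llvm;

// Minimal sketch for the v16i32 -> v16i8 case; helper name invented.
static SDValue truncateV16I32ToV16I8(SDValue In, const SDLoc &DL,
                                     SelectionDAG &DAG) {
  assert(In.getValueType() == MVT::v16i32 && "expected a v16i32 input");

  // Keep only the low byte of every 32-bit lane so that the saturating
  // PACKUS packs below cannot change the values.
  SDValue Mask = DAG.getConstant(255, DL, MVT::v16i32);
  SDValue Masked = DAG.getNode(ISD::AND, DL, MVT::v16i32, In, Mask);

  // Split the masked vector into four 128-bit pieces, one per XMM register.
  SDValue Parts[4];
  for (unsigned i = 0; i != 4; ++i)
    Parts[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, Masked,
                           DAG.getIntPtrConstant(i * 4, DL));

  // First level: reinterpret each v4i32 piece as v8i16 (lanes alternate
  // value/zero) and pack pairs of pieces down to bytes; read back as v8i16,
  // each result holds eight of the original values.
  SDValue Lo = DAG.getNode(
      X86ISD::PACKUS, DL, MVT::v16i8,
      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Parts[0]),
      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Parts[1]));
  SDValue Hi = DAG.getNode(
      X86ISD::PACKUS, DL, MVT::v16i8,
      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Parts[2]),
      DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Parts[3]));

  // Second level: pack the two intermediate vectors into the final v16i8.
  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Lo),
                     DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, Hi));
}

The key point is that the AND is emitted first, which is what lets the combine use the unsigned-saturating PACKUS nodes even though a plain truncation has no saturation semantics.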