This patch transforms truncations from v8i32/v16i32 to v8i8/v16i8 into a bitwise AND followed by X86ISD::PACKUS operations during DAG combine. We don't do this in the lowering phase because, after type legalization, the original truncation has already been turned into a BUILD_VECTOR whose elements are each extracted from a vector and then truncated, and it is difficult to perform this optimization starting from that form. This greatly improves the performance of those two truncations. For example, for the following IR:
define void @truncate_v16i32_to_v16i8(<16 x i32> %a) {
  %1 = trunc <16 x i32> %a to <16 x i8>
  store <16 x i8> %1, <16 x i8>* undef, align 4
  ret void
}
Previously, on SSE2, it was compiled into 33 instructions:
movdqa %xmm3, -24(%rsp)
movdqa %xmm1, -56(%rsp)
movdqa %xmm2, -40(%rsp)
movdqa %xmm0, -72(%rsp)
punpcklbw %xmm3, %xmm1
punpcklbw %xmm2, %xmm0
punpcklbw %xmm1, %xmm0
movd -20(%rsp), %xmm1
movd -52(%rsp), %xmm2
movd -16(%rsp), %xmm3
movd -48(%rsp), %xmm4
punpcklbw %xmm3, %xmm4
movd -36(%rsp), %xmm3
movd -68(%rsp), %xmm5
movd -32(%rsp), %xmm6
movd -64(%rsp), %xmm7
punpcklbw %xmm6, %xmm7
punpcklbw %xmm4, %xmm7
punpcklbw %xmm7, %xmm0
punpcklbw %xmm1, %xmm2
punpcklbw %xmm3, %xmm5
punpcklbw %xmm2, %xmm5
movd -12(%rsp), %xmm1
movd -44(%rsp), %xmm2
punpcklbw %xmm1, %xmm2
movd -28(%rsp), %xmm1
movd -60(%rsp), %xmm3
punpcklbw %xmm1, %xmm3
punpcklbw %xmm2, %xmm3
punpcklbw %xmm3, %xmm5
punpcklbw %xmm5, %xmm0
movdqu %xmm0, (%rax)
retq
and now it is compiled into 10 instructions:
movdqa LCPI0_0(%rip), %xmm4
pand %xmm4, %xmm3
pand %xmm4, %xmm2
packuswb %xmm3, %xmm2
pand %xmm4, %xmm1
pand %xmm4, %xmm0
packuswb %xmm1, %xmm0
packuswb %xmm2, %xmm0
movdqu %xmm0, (%rax)
retq
which saves 23 instructions (many of them are memops).
Maybe use the same naming convention for the out / in VTs?
OutVT + OutSVT
InVT + InSVT
Makes it easier to track.