diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -39841,6 +39841,21 @@ if (auto USatVal = detectUSatPattern(In, VT, DAG, DL)) return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); } + + // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is + // split across two registers. We can use a packusdw+perm to clamp to 0-65535 + // and concatenate at the same time. Then we can use a final vpmovuswb to + // clip to 0-255. + if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && + InVT == MVT::v16i32 && VT == MVT::v16i8) { + if (auto USatVal = detectSSatPattern(In, VT, true)) { + // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB. + SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal, + DL, DAG, Subtarget); + return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid); + } + } + if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) && !(Subtarget.hasAVX512() && InSVT == MVT::i32) && !(Subtarget.hasBWI() && InSVT == MVT::i16) && diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -1083,12 +1083,10 @@ define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32>* %p, <16 x i8>* %q) "min-legal-vector-width"="256" { ; CHECK-LABEL: trunc_packus_v16i32_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vpmaxsd 32(%rdi), %ymm0, %ymm1 -; CHECK-NEXT: vpmovusdb %ymm1, %xmm1 -; CHECK-NEXT: vpmaxsd (%rdi), %ymm0, %ymm0 -; CHECK-NEXT: vpmovusdb %ymm0, %xmm0 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-NEXT: vpackusdw 32(%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %a = load <16 x i32>, <16 x i32>* %p