Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -34599,24 +34599,58 @@
   return false;
 }
 
-/// Detect a pattern of truncation with unsigned saturation:
-/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
-/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched.
-static SDValue detectUSatPattern(SDValue In, EVT VT) {
-  if (In.getOpcode() != ISD::UMIN)
-    return SDValue();
-
+/// Detect patterns of truncation with unsigned saturation:
+///
+/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+///    Return the source value x to be truncated, or SDValue() if the pattern
+///    was not matched.
+///
+/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
+///    where C1 >= 0 and C2 is the unsigned max of the destination type.
+///
+///    (truncate (smax (smin (x, C2), C1)) to dest_type),
+///    where C1 >= 0, C2 is the unsigned max of the destination type, and
+///    C1 <= C2.
+///
+///    These two patterns are equivalent to:
+///    (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
+///    so return the smax(x, C1) value to be truncated, or SDValue() if the
+///    pattern was not matched.
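+///
+/// As a concrete illustration for an i16 to i8 truncate: with C1 = 0 and
+/// C2 = 255, smax(x, 0) is known non-negative, so taking the signed min
+/// with 255 gives the same result as taking the unsigned min with the
+/// i8 unsigned max.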
+static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                 const SDLoc &DL) {
   // Saturation with truncation. We truncate from InVT to VT.
   assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
          "Unexpected types for truncate operation");
 
-  APInt C;
-  if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
-    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
+  // Match a min/max node whose second operand is a constant splat; return
+  // its first operand and pass the splat value back through Limit.
+  auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
+    if (V.getOpcode() == Opcode &&
+        ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
+      return V.getOperand(0);
+    return SDValue();
+  };
+
+  APInt C1, C2;
+  if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
+    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
     // the element size of the destination type.
-    return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
-  }
+    if (C2.isMask(VT.getScalarSizeInBits()))
+      return UMin;
+
+  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
+    if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, C1))
+      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
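+        // MatchMinMax has already peeled off the outer SMIN here, so SMin
+        // actually names the inner smax(x, C1) node, which is exactly the
+        // value the caller needs to truncate.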
+        return SMin;
+
+  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
+    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
+      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
+          C2.uge(C1)) {
+        // In this form the smax is outermost, so materialize a fresh
+        // smax(x, C1) node for the caller to truncate.
+        return DAG.getNode(ISD::SMAX, DL, In.getValueType(), SMin,
+                           DAG.getConstant(C1, DL, In.getValueType()));
+      }
 
   return SDValue();
 }
 
@@ -34682,14 +34716,15 @@
 /// The types should allow to use VPMOVUS* instruction on AVX512.
 /// Return the source value to be truncated or SDValue() if the pattern was not
 /// matched.
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                       const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        const TargetLowering &TLI) {
   if (!TLI.isTypeLegal(In.getValueType()))
     return SDValue();
   if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
     return SDValue();
-  return detectUSatPattern(In, VT);
+  return detectUSatPattern(In, VT, DAG, DL);
 }
 
 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
@@ -34703,7 +34738,7 @@
       isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
     if (auto SSatVal = detectSSatPattern(In, VT))
       return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
-    if (auto USatVal = detectUSatPattern(In, VT))
+    if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
   }
   if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
@@ -35378,9 +35413,8 @@
       return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
                              dl, Val, St->getBasePtr(), St->getMemoryVT(),
                              St->getMemOperand(), DAG);
-    if (SDValue Val =
-        detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
-                                TLI))
+    if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
+                                              DAG, dl, Subtarget, TLI))
       return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                              dl, Val, St->getBasePtr(), St->getMemoryVT(),
                              St->getMemOperand(), DAG);
Index: test/CodeGen/X86/avx512-trunc.ll
===================================================================
--- test/CodeGen/X86/avx512-trunc.ll
+++ test/CodeGen/X86/avx512-trunc.ll
@@ -775,9 +775,8 @@
 ;   %res = trunc %d
-
-define void @smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: smax_usat_trunc_wb_256_mem:
+define void @smax_usat_trunc_wb_256_mem1(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: smax_usat_trunc_wb_256_mem1:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
@@ -787,12 +786,11 @@
 ; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
-; SKX-LABEL: smax_usat_trunc_wb_256_mem:
+; SKX-LABEL: smax_usat_trunc_wb_256_mem1:
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
+; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
   %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@@ -804,6 +802,34 @@
   ret void
 }
 
+; Test for smax(smin(x, C2), C1).
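+; Here C2 = 255 (the i8 unsigned max) and C1 = 0, so the clamp is an unsigned
+; saturation and SKX can lower the truncating store to vpmovuswb.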
+define void @smax_usat_trunc_wb_256_mem2(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: smax_usat_trunc_wb_256_mem2:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_wb_256_mem2:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp slt <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x3 = icmp sgt <16 x i16> %x2, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
+
 define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) {
 ; KNL-LABEL: smax_usat_trunc_wb_256:
 ; KNL:       ## %bb.0:
@@ -819,8 +845,7 @@
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; SKX-NEXT:    vpmovwb %ymm0, %xmm0
+; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
 ; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@@ -845,8 +870,7 @@
 ; SKX:       ## %bb.0:
 ; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
-; SKX-NEXT:    vpminsw {{.*}}(%rip), %xmm0, %xmm0
-; SKX-NEXT:    vpmovwb %xmm0, (%rdi)
+; SKX-NEXT:    vpmovuswb %xmm0, (%rdi)
 ; SKX-NEXT:    retq
   %x1 = icmp sgt <8 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
   %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
@@ -862,8 +886,7 @@
 ; ALL:       ## %bb.0:
 ; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vpmovdb %zmm0, (%rdi)
+; ALL-NEXT:    vpmovusdb %zmm0, (%rdi)
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -880,8 +903,7 @@
 ; ALL:       ## %bb.0:
 ; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT:    vpmovqb %zmm0, (%rdi)
+; ALL-NEXT:    vpmovusqb %zmm0, (%rdi)
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
@@ -898,8 +920,7 @@
 ; ALL:       ## %bb.0:
 ; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT:    vpmovqd %zmm0, (%rdi)
+; ALL-NEXT:    vpmovusqd %zmm0, (%rdi)
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
@@ -916,8 +937,7 @@
 ; ALL:       ## %bb.0:
 ; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT:    vpmovqw %zmm0, (%rdi)
+; ALL-NEXT:    vpmovusqw %zmm0, (%rdi)
 ; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
@@ -933,13 +953,10 @@
 ; KNL-LABEL: smax_usat_trunc_db_1024:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
 ; KNL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; KNL-NEXT:    vpminsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vpminsd %zmm2, %zmm0, %zmm0
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovdb %zmm1, %xmm1
+; KNL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
+; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
 ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT:    retq
 ;
@@ -968,13 +985,10 @@
 ; KNL-LABEL: smax_usat_trunc_db_1024_mem:
 ; KNL:       ## %bb.0:
 ; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
 ; KNL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; KNL-NEXT:    vpminsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vpminsd %zmm2, %zmm0, %zmm0
-; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovdb %zmm1, %xmm1
+; KNL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
+; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
 ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT:    vmovdqu %ymm0, (%rdi)
 ; KNL-NEXT:    vzeroupper
@@ -1008,8 +1022,7 @@
 ; ALL:       ## %bb.0:
 ; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
-; ALL-NEXT:    vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT:    vpmovdw %zmm0, %ymm0
+; ALL-NEXT:    vpmovusdw %zmm0, %ymm0
 ; ALL-NEXT:    retq
   %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
   %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
Index: test/CodeGen/X86/vector-trunc-packus.ll
===================================================================
--- test/CodeGen/X86/vector-trunc-packus.ll
+++ test/CodeGen/X86/vector-trunc-packus.ll
@@ -244,10 +244,9 @@
 ;
 ; AVX512VL-LABEL: trunc_packus_v4i64_v4i32:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT:    vpmovusqd %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -265,10 +264,9 @@
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmaxsq %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovusqd %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -665,10 +663,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v8i64_v8i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512-NEXT:    vpmovusqd %zmm0, %ymm0
 ; AVX512-NEXT:    retq
   %1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -1070,10 +1067,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v8i64_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vpmovusqw %zmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
@@ -1170,10 +1166,9 @@
 ;
 ; AVX512VL-LABEL: trunc_packus_v8i32_v8i16:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vpmovusdw %ymm0, %xmm0
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -1190,10 +1185,9 @@
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovusdw %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@@ -1325,10 +1319,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v16i32_v16i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512-NEXT:    vpmovusdw %zmm0, %ymm0
 ; AVX512-NEXT:    retq
   %1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
   %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@@ -2130,10 +2123,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v8i64_v8i8_store:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpmovqb %zmm0, (%rdi)
+; AVX512-NEXT:    vpmovusqb %zmm0, (%rdi)
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
@@ -3042,10 +3034,9 @@
 ;
 ; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT:    vpmovdb %ymm0, (%rdi)
+; AVX512VL-NEXT:    vpmovusdb %ymm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
@@ -3063,10 +3054,9 @@
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rdi)
+; AVX512BWVL-NEXT:    vpmovusdb %ymm0, (%rdi)
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -3107,10 +3097,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v16i32_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpmovusdb %zmm0, %xmm0
 ; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = icmp slt <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -3173,10 +3162,9 @@
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpmovuswb %ymm0, %xmm0
 ; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %1 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -3242,18 +3230,16 @@
 ;
 ; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpminsw {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpmovuswb %zmm0, %ymm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpminsw {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BWVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vpmaxsw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT:    vpmovuswb %zmm0, %ymm0
 ; AVX512BWVL-NEXT:    retq
   %1 = icmp slt <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
   %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>