Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -34599,24 +34599,58 @@
   return false;
 }
 
-/// Detect a pattern of truncation with unsigned saturation:
-/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
-/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched.
-static SDValue detectUSatPattern(SDValue In, EVT VT) {
-  if (In.getOpcode() != ISD::UMIN)
-    return SDValue();
-
+/// Detect patterns of truncation with unsigned saturation:
+///
+/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+///    Return the source value x to be truncated or SDValue() if the pattern
+///    was not matched.
+///
+/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
+///    where C1 >= 0 and C2 is the unsigned max of the destination type.
+///
+///    (truncate (smax (smin (x, C2), C1)) to dest_type),
+///    where C1 >= 0, C2 is the unsigned max of the destination type and
+///    C1 <= C2.
+///
+///    These two patterns are equivalent to:
+///    (truncate (umin (smax (x, C1), unsigned_max_of_dest_type)) to dest_type)
+///    So return the smax(x, C1) value to be truncated or SDValue() if the
+///    pattern was not matched.
+static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                 const SDLoc &DL) {
   // Saturation with truncation. We truncate from InVT to VT.
   assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
          "Unexpected types for truncate operation");
 
-  APInt C;
-  if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
-    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
+  // Match min/max and return the limit value as a parameter.
+  auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
+    if (V.getOpcode() == Opcode &&
+        ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
+      return V.getOperand(0);
+    return SDValue();
+  };
+
+  APInt C1, C2;
+  if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
+    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
     // the element size of the destination type.
-    return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
-  }
+    if (C2.isMask(VT.getScalarSizeInBits()))
+      return UMin;
+
+  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
+    if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, C1))
+      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
+        return SMin;
+
+  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
+    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
+      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
+          C2.uge(C1)) {
+        SDValue Ops[2];
+        Ops[0] = SMin;
+        Ops[1] = DAG.getConstant(C1, DL, In.getValueType());
+        return DAG.getNode(ISD::SMAX, DL, Ops[0].getValueType(), Ops);
+      }
   return SDValue();
 }
 
@@ -34682,14 +34716,15 @@
 /// The types should allow to use VPMOVUS* instruction on AVX512.
 /// Return the source value to be truncated or SDValue() if the pattern was not
 /// matched.
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
+static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
+                                       const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        const TargetLowering &TLI) {
   if (!TLI.isTypeLegal(In.getValueType()))
     return SDValue();
   if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
     return SDValue();
-  return detectUSatPattern(In, VT);
+  return detectUSatPattern(In, VT, DAG, DL);
 }
 
 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
@@ -34703,7 +34738,7 @@
       isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
     if (auto SSatVal = detectSSatPattern(In, VT))
       return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
-    if (auto USatVal = detectUSatPattern(In, VT))
+    if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
       return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
   }
   if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
@@ -35378,9 +35413,8 @@
     return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
                            dl, Val, St->getBasePtr(), St->getMemoryVT(),
                            St->getMemOperand(), DAG);
-  if (SDValue Val =
-          detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
-                                  TLI))
+  if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
+                                            DAG, dl, Subtarget, TLI))
     return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
                            dl, Val, St->getBasePtr(), St->getMemoryVT(),
                            St->getMemOperand(), DAG);
Index: test/CodeGen/X86/avx512-trunc.ll
===================================================================
--- test/CodeGen/X86/avx512-trunc.ll
+++ test/CodeGen/X86/avx512-trunc.ll
@@ -790,8 +790,7 @@
 ; SKX: ## %bb.0:
 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; SKX-NEXT: vpmovwb %ymm0, (%rdi)
+; SKX-NEXT: vpmovuswb %ymm0, (%rdi)
 ; SKX-NEXT: vzeroupper
 ; SKX-NEXT: retq
 %x1 = icmp sgt <16 x i16> %i,
@@ -817,10 +816,9 @@
 ;
 ; SKX-LABEL: smax_usat_trunc_wb_256_mem2:
 ; SKX: ## %bb.0:
-; SKX-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpmovwb %ymm0, (%rdi)
+; SKX-NEXT: vpmovuswb %ymm0, (%rdi)
 ; SKX-NEXT: vzeroupper
 ; SKX-NEXT: retq
 %x1 = icmp slt <16 x i16> %i,
@@ -847,8 +845,7 @@
 ; SKX: ## %bb.0:
 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; SKX-NEXT: vpmovwb %ymm0, %xmm0
+; SKX-NEXT: vpmovuswb %ymm0, %xmm0
 ; SKX-NEXT: vzeroupper
 ; SKX-NEXT: retq
 %x1 = icmp sgt <16 x i16> %i,
@@ -873,8 +870,7 @@
 ; SKX: ## %bb.0:
 ; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; SKX-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
-; SKX-NEXT: vpmovwb %xmm0, (%rdi)
+; SKX-NEXT: vpmovuswb %xmm0, (%rdi)
 ; SKX-NEXT: retq
 %x1 = icmp sgt <8 x i16> %i,
 %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16>
@@ -890,8 +886,7 @@
 ; ALL: ## %bb.0:
 ; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpmovdb %zmm0, (%rdi)
+; ALL-NEXT: vpmovusdb %zmm0, (%rdi)
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: retq
 %x1 = icmp sgt <16 x i32> %i,
@@ -908,8 +903,7 @@
 ; ALL: ## %bb.0:
 ; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT: vpmovqb %zmm0, (%rdi)
+; ALL-NEXT: vpmovusqb %zmm0, (%rdi)
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: retq
 %x1 = icmp sgt <8 x i64> %i,
@@ -926,8 +920,7 @@
 ; ALL: ## %bb.0:
 ; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT: vpmovqd %zmm0, (%rdi)
+; ALL-NEXT: vpmovusqd %zmm0, (%rdi)
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: retq
 %x1 = icmp sgt <8 x i64> %i,
@@ -944,8 +937,7 @@
 ; ALL: ## %bb.0:
 ; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT: vpmovqw %zmm0, (%rdi)
+; ALL-NEXT: vpmovusqw %zmm0, (%rdi)
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: retq
 %x1 = icmp sgt <8 x i64> %i,
@@ -961,13 +953,10 @@
 ; KNL-LABEL: smax_usat_trunc_db_1024:
 ; KNL: ## %bb.0:
 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
 ; KNL-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; KNL-NEXT: vpminsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT: vpminsd %zmm2, %zmm0, %zmm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
+; KNL-NEXT: vpmovusdb %zmm0, %xmm0
+; KNL-NEXT: vpmovusdb %zmm1, %xmm1
 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT: retq
 ;
@@ -996,13 +985,10 @@
 ; KNL-LABEL: smax_usat_trunc_db_1024_mem:
 ; KNL: ## %bb.0:
 ; KNL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; KNL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
 ; KNL-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT: vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; KNL-NEXT: vpminsd %zmm2, %zmm1, %zmm1
-; KNL-NEXT: vpminsd %zmm2, %zmm0, %zmm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vpmovdb %zmm1, %xmm1
+; KNL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
+; KNL-NEXT: vpmovusdb %zmm0, %xmm0
+; KNL-NEXT: vpmovusdb %zmm1, %xmm1
 ; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT: vmovdqu %ymm0, (%rdi)
 ; KNL-NEXT: vzeroupper
@@ -1036,8 +1022,7 @@
 ; ALL: ## %bb.0:
 ; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
-; ALL-NEXT: vpmovdw %zmm0, %ymm0
+; ALL-NEXT: vpmovusdw %zmm0, %ymm0
 ; ALL-NEXT: retq
 %x1 = icmp sgt <16 x i32> %i,
 %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32>
Index: test/CodeGen/X86/vector-trunc-packus.ll
===================================================================
--- test/CodeGen/X86/vector-trunc-packus.ll
+++ test/CodeGen/X86/vector-trunc-packus.ll
@@ -244,10 +244,9 @@
 ;
 ; AVX512VL-LABEL: trunc_packus_v4i64_v4i32:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -265,10 +264,9 @@
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpminsq {{.*}}(%rip){1to4}, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovusqd %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 %1 = icmp slt <4 x i64> %a0,
@@ -665,10 +663,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v8i64_v8i32:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
+; AVX512-NEXT: vpmovusqd %zmm0, %ymm0
 ; AVX512-NEXT: retq
 %1 = icmp slt <8 x i64> %a0,
 %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64>
@@ -1070,10 +1067,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v8i64_v8i16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
+; AVX512-NEXT: vpmovusqw %zmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %1 = icmp slt <8 x i64> %a0,
@@ -1170,10 +1166,9 @@
 ;
 ; AVX512VL-LABEL: trunc_packus_v8i32_v8i16:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -1190,10 +1185,9 @@
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 %1 = icmp slt <8 x i32> %a0,
@@ -1325,10 +1319,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v16i32_v16i16:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512-NEXT: vpmovusdw %zmm0, %ymm0
 ; AVX512-NEXT: retq
 %1 = icmp slt <16 x i32> %a0,
 %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32>
@@ -2130,10 +2123,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v8i64_v8i8_store:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqb %zmm0, (%rdi)
+; AVX512-NEXT: vpmovusqb %zmm0, (%rdi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %1 = icmp slt <8 x i64> %a0,
@@ -3042,10 +3034,9 @@
 ;
 ; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store:
 ; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, (%rdi)
+; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi)
 ; AVX512VL-NEXT: vzeroupper
 ; AVX512VL-NEXT: retq
 ;
@@ -3063,10 +3054,9 @@
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rdi)
+; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi)
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 %1 = icmp slt <8 x i32> %a0,
@@ -3107,10 +3097,9 @@
 ;
 ; AVX512-LABEL: trunc_packus_v16i32_v16i8:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsd {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
+; AVX512-NEXT: vpmovusdb %zmm0, %xmm0
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
 %1 = icmp slt <16 x i32> %a0,
@@ -3173,10 +3162,9 @@
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
+; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0
 ; AVX512BWVL-NEXT: vzeroupper
 ; AVX512BWVL-NEXT: retq
 %1 = icmp slt <16 x i16> %a0,
@@ -3242,18 +3230,16 @@
 ;
 ; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
 ; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpminsw {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0
 ; AVX512BWVL-NEXT: retq
 %1 = icmp slt <32 x i16> %a0,
 %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16>
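
Note on the transform: the new matches in detectUSatPattern rest on a scalar
identity. For C1 >= 0 and C2 equal to the unsigned maximum of the destination
type, truncating smin(smax(x, C1), C2) produces the same bits as
unsigned-saturating smax(x, C1) down to the destination width, which is the
per-element operation of the VPMOVUS* family. The standalone C++ sketch below
is not part of the patch; it merely spot-checks the identity for the i32 -> i8
case under the stated assumptions (C1 non-negative, C2 == UINT8_MAX):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const int32_t C1 = 0;         // non-negative lower bound (the smax limit)
  const int32_t C2 = UINT8_MAX; // unsigned max of the destination type
  // Sample the i32 range with a prime stride.
  for (int64_t v = INT32_MIN; v <= INT32_MAX; v += 9973) {
    const int32_t x = static_cast<int32_t>(v);
    // Pattern 2 from the comment: signed clamp, then truncate.
    const uint8_t SignedForm =
        static_cast<uint8_t>(std::min(std::max(x, C1), C2));
    // Rewritten form: smax first, then unsigned saturation to 8 bits,
    // i.e. what VPMOVUSDB computes per element.
    const uint32_t SMax = static_cast<uint32_t>(std::max(x, C1));
    const uint8_t USatForm =
        static_cast<uint8_t>(std::min<uint32_t>(SMax, UINT8_MAX));
    assert(SignedForm == USatForm);
  }
  return 0;
}

The same check passes for any non-negative C1 <= C2 (e.g. 10), since
smax(x, C1) is already non-negative, so the unsigned min against 255 coincides
with the signed min against C2 -- the C2.uge(C1) guard in the third match
enforces exactly this precondition.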