Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -34554,13 +34554,31 @@
   return false;
 }
 
-/// Detect a pattern of truncation with unsigned saturation:
-/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
-/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched.
+/// Detect patterns of truncation with unsigned saturation:
+///
+/// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
+///    Return the source value x to be truncated or SDValue() if the pattern
+///    was not matched.
+///
+/// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
+///    where C1 >= 0 and C2 is the unsigned max of the destination type.
+///    Return the smax(x, C1) value to be truncated or SDValue() if the
+///    pattern was not matched.
 static SDValue detectUSatPattern(SDValue In, EVT VT) {
-  if (In.getOpcode() != ISD::UMIN)
+  if (In.getOpcode() == ISD::SMIN) {
+    SDValue MinOp = In.getOperand(0);
+
+    if (MinOp.getOpcode() != ISD::SMAX)
+      return SDValue();
+
+    APInt C;
+    if (!ISD::isConstantSplatVector(MinOp.getOperand(1).getNode(), C) ||
+        !C.isNonNegative())
+      return SDValue();
+
+  } else if (In.getOpcode() != ISD::UMIN) {
     return SDValue();
+  }
 
   // Saturation with truncation. We truncate from InVT to VT.
   assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
@@ -34569,7 +34587,8 @@
   APInt C;
   if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
     // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
-    // the element size of the destination type.
+    // to the element size of the destination type. For the smin(smax)
+    // pattern this also guarantees that the SMIN constant is non-negative.
     return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
   }
   return SDValue();
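For review context, here is a scalar model of the two saturation forms the combine now recognizes (illustrative C++ only, not LLVM API; the function names are ours). Both reduce to clamping the value into [0, 255] before truncation, which is why the smin(smax) form can reuse the same saturating lowering:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Pattern 1: trunc(umin(x, 255)) on an unsigned value.
static uint8_t usat1(uint16_t x) {
  return static_cast<uint8_t>(std::min<uint16_t>(x, 255));
}

// Pattern 2: trunc(smin(smax(x, C1), 255)) with C1 >= 0 (here C1 = 0).
static uint8_t usat2(int16_t x) {
  return static_cast<uint8_t>(std::min<int16_t>(std::max<int16_t>(x, 0), 255));
}

int main() {
  assert(usat2(-7) == 0);     // negative inputs saturate to 0
  assert(usat2(300) == 255);  // oversized inputs saturate to 255
  assert(usat2(42) == 42);    // in-range inputs pass through
  assert(usat1(300) == 255);  // the umin form saturates the same way
}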
Index: test/CodeGen/X86/avx512-trunc.ll
===================================================================
--- test/CodeGen/X86/avx512-trunc.ll
+++ test/CodeGen/X86/avx512-trunc.ll
@@ -764,3 +764,299 @@
   ret <16 x i8> %tmp4
 }
 
+
+
+; Tests for the following unsigned saturation pattern:
+
+; %a = icmp sgt %x, C1
+; %b = select %a, %x, C1    ; %b = smax(%x, C1)
+; %c = icmp slt %b, C2
+; %d = select %c, %b, C2    ; %d = smin(%b, C2)
+; %res = trunc %d
+
+
+define void @smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: smax_usat_trunc_wb_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_wb_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
+
+define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) {
+; KNL-LABEL: smax_usat_trunc_wb_256:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_wb_256:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  ret <16 x i8> %x6
+}
+
+define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
+; KNL-LABEL: smax_usat_trunc_wb_128_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
+; KNL-NEXT:    vmovq %xmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_wb_128_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0
+; SKX-NEXT:    vpmovuswb %xmm0, (%rdi)
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <8 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x3 = icmp slt <8 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <8 x i1> %x3, <8 x i16> %x2, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <8 x i16> %x5 to <8 x i8>
+  store <8 x i8> %x6, <8 x i8>* %res, align 1
+  ret void
+}
+
+define void @smax_usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
+; ALL-LABEL: smax_usat_trunc_db_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusdb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x3 = icmp slt <16 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x6 = trunc <16 x i32> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
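As an aside for reviewers: a common source of this IR shape (a sketch of a plausible origin, not part of the patch) is a vectorized clamp-and-narrow loop. The loop vectorizer can turn the scalar clamp below into exactly the icmp/select/trunc sequence exercised by these tests:

// Hypothetical C++ origin of the <16 x i16> -> <16 x i8> tests above.
void clamp_i16_to_u8(const short *in, unsigned char *out, int n) {
  for (int i = 0; i < n; ++i) {
    short v = in[i];
    v = v > 0 ? v : 0;         // becomes icmp sgt + select, i.e. smax(v, 0)
    v = v < 255 ? v : 255;     // becomes icmp slt + select, i.e. smin(v, 255)
    out[i] = (unsigned char)v; // becomes the trunc + store
  }
}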
+define void @smax_usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
+; ALL-LABEL: smax_usat_trunc_qb_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusqb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x3 = icmp slt <8 x i64> %x2, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
+  %x6 = trunc <8 x i64> %x5 to <8 x i8>
+  store <8 x i8> %x6, <8 x i8>* %res, align 1
+  ret void
+}
+
+define void @smax_usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
+; ALL-LABEL: smax_usat_trunc_qd_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusqd %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x3 = icmp slt <8 x i64> %x2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
+  %x6 = trunc <8 x i64> %x5 to <8 x i32>
+  store <8 x i32> %x6, <8 x i32>* %res, align 1
+  ret void
+}
+
+define void @smax_usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
+; ALL-LABEL: smax_usat_trunc_qw_512_mem:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusqw %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
+  %x3 = icmp slt <8 x i64> %x2, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
+  %x6 = trunc <8 x i64> %x5 to <8 x i16>
+  store <8 x i16> %x6, <8 x i16>* %res, align 1
+  ret void
+}
+
+define <32 x i8> @smax_usat_trunc_db_1024(<32 x i32> %i) {
+; KNL-LABEL: smax_usat_trunc_db_1024:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; KNL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
+; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
+; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_db_1024:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; SKX-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SKX-NEXT:    vpminsd %zmm2, %zmm1, %zmm1
+; SKX-NEXT:    vpminsd %zmm2, %zmm0, %zmm0
+; SKX-NEXT:    vpmovdw %zmm0, %ymm0
+; SKX-NEXT:    vpmovdw %zmm1, %ymm1
+; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT:    vpmovwb %zmm0, %ymm0
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x6 = trunc <32 x i32> %x5 to <32 x i8>
+  ret <32 x i8> %x6
+}
+
+define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
+; KNL-LABEL: smax_usat_trunc_db_1024_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; KNL-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; KNL-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
+; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
+; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    vmovdqu %ymm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: smax_usat_trunc_db_1024_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; SKX-NEXT:    vpmaxsd %zmm2, %zmm0, %zmm0
+; SKX-NEXT:    vpmaxsd %zmm2, %zmm1, %zmm1
+; SKX-NEXT:    vpbroadcastd {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; SKX-NEXT:    vpminsd %zmm2, %zmm1, %zmm1
+; SKX-NEXT:    vpminsd %zmm2, %zmm0, %zmm0
+; SKX-NEXT:    vpmovdw %zmm0, %ymm0
+; SKX-NEXT:    vpmovdw %zmm1, %ymm1
+; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT:    vpmovwb %zmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
+  %x6 = trunc <32 x i32> %x5 to <32 x i8>
+  store <32 x i8> %x6, <32 x i8>* %p, align 1
+  ret void
+}
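The qb/qd/qw tests above check that the combine selects the unsigned-saturating truncate instructions directly. For reference, the intended lowering expressed with standard AVX-512 intrinsics (a sketch; the wrapper name is ours, assumes AVX512F):

#include <immintrin.h>

// Clamp negatives with a signed max against zero, then let the
// unsigned-saturating truncate VPMOVUSQB handle the upper bound.
__m128i trunc_usat_q_to_b(__m512i v) {
  v = _mm512_max_epi64(v, _mm512_setzero_si512()); // VPMAXSQ with 0
  return _mm512_cvtusepi64_epi8(v);                // VPMOVUSQB
}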
+define <16 x i16> @smax_usat_trunc_dw_512(<16 x i32> %i) {
+; ALL-LABEL: smax_usat_trunc_dw_512:
+; ALL:       ## %bb.0:
+; ALL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; ALL-NEXT:    vpmaxsd %zmm1, %zmm0, %zmm0
+; ALL-NEXT:    vpmovusdw %zmm0, %ymm0
+; ALL-NEXT:    retq
+  %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  %x3 = icmp slt <16 x i32> %x2, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
+  %x6 = trunc <16 x i32> %x5 to <16 x i16>
+  ret <16 x i16> %x6
+}
+
+define void @negative_test1_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; KNL-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; KNL-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; SKX-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; SKX-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
+; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+  %x3 = icmp slt <16 x i16> %x2, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
+
+define void @negative_test2_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
+; KNL-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
+; KNL:       ## %bb.0:
+; KNL-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
+; KNL-NEXT:    vpmovdb %zmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
+; SKX:       ## %bb.0:
+; SKX-NEXT:    vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    vpminsw {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %x1 = icmp sgt <16 x i16> %i, <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10>
+  %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10>
+  %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+  %x6 = trunc <16 x i16> %x5 to <16 x i8>
+  store <16 x i8> %x6, <16 x i8>* %res, align 1
+  ret void
+}
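The negative tests document why both constant checks in detectUSatPattern are needed: negative_test1 uses an SMIN constant (-1) that is not the destination type's unsigned max, and negative_test2 uses a negative SMAX constant. A scalar counterexample for the latter (illustrative C++, not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  // With a negative smax constant (C1 = -10, as in negative_test2) the
  // clamped value can stay negative, so plain truncation wraps instead
  // of saturating.
  int16_t x = -5;
  int16_t clamped = std::min<int16_t>(std::max<int16_t>(x, -10), 255);
  assert(clamped == -5);                        // still negative after the clamp
  assert(static_cast<uint8_t>(clamped) == 251); // truncation wraps: 0xFB, not 0
  // An unsigned-saturating truncate (VPMOVUSWB) would produce 0 here, so
  // rewriting this pattern into one would change the program's result.
}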