Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -36728,34 +36728,20 @@
   EVT VT = N->getValueType(0);
   EVT SrcVT = Src.getValueType();
 
-  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
+  auto IsFreeTruncation = [VT](SDValue Op) {
     unsigned TruncSizeInBits = VT.getScalarSizeInBits();
 
-    // Repeated operand, so we are only trading one output truncation for
-    // one input truncation.
-    if (Op0 == Op1)
-      return true;
-
-    // See if either operand has been extended from a smaller/equal size to
+    // See if this has been extended from a smaller/equal size to
     // the truncation size, allowing a truncation to combine with the extend.
-    unsigned Opcode0 = Op0.getOpcode();
-    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
-         Opcode0 == ISD::ZERO_EXTEND) &&
-        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
-      return true;
-
-    unsigned Opcode1 = Op1.getOpcode();
-    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
-         Opcode1 == ISD::ZERO_EXTEND) &&
-        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
+    unsigned Opcode = Op.getOpcode();
+    if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND ||
+         Opcode == ISD::ZERO_EXTEND) &&
+        Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
       return true;
 
-    // See if either operand is a single use constant which can be constant
-    // folded.
-    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
-    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
-    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
-           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+    // See if this is a single use constant which can be constant folded.
+    SDValue BC = peekThroughOneUseBitcasts(Op);
+    return ISD::isBuildVectorOfConstantSDNodes(BC.getNode());
   };
 
   auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
@@ -36783,7 +36769,7 @@
     SDValue Op0 = Src.getOperand(0);
     SDValue Op1 = Src.getOperand(1);
     if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
-        IsRepeatedOpOrFreeTruncation(Op0, Op1))
+        (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
      return TruncateArithmetic(Op0, Op1);
    break;
  }
@@ -36796,11 +36782,20 @@
       return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
     LLVM_FALLTHROUGH;
   case ISD::ADD: {
-    // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
     SDValue Op0 = Src.getOperand(0);
     SDValue Op1 = Src.getOperand(1);
     if (TLI.isOperationLegal(Opcode, VT) &&
-        IsRepeatedOpOrFreeTruncation(Op0, Op1))
+        (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+  case ISD::SUB: {
+    // TODO: ISD::SUB We are conservative and require both sides to be freely
+    // truncatable to avoid interfering with combineSubToSubus.
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegal(Opcode, VT) &&
+        (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
       return TruncateArithmetic(Op0, Op1);
     break;
   }
Index: test/CodeGen/X86/vector-trunc-math.ll
===================================================================
--- test/CodeGen/X86/vector-trunc-math.ll
+++ test/CodeGen/X86/vector-trunc-math.ll
@@ -1321,6 +1321,23 @@
   ret <16 x i8> %2
 }
 
+define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
+; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = zext <16 x i8> %x to <16 x i16>
+  %b = zext <16 x i8> %y to <16 x i16>
+  %c = sub <16 x i16> %a, %b
+  %d = trunc <16 x i16> %c to <16 x i8>
+  ret <16 x i8> %d
+}
+
 ;
 ; sub to constant
 ;
@@ -1770,6 +1787,41 @@
   ret <16 x i8> %2
 }
 
+define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
+; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    psubb {{.*}}(%rip), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpsubb {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %a = zext <16 x i8> %x to <16 x i16>
+  %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
+  %c = trunc <16 x i16> %b to <16 x i8>
+  ret <16 x i8> %c
+}
+
+define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
+; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; SSE-NEXT:    psubb %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX-NEXT:    vpsubb %xmm0, %xmm1, %xmm0
+; AVX-NEXT:    retq
+  %a = zext <16 x i8> %x to <16 x i16>
+  %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
+  %c = trunc <16 x i16> %b to <16 x i8>
+  ret <16 x i8> %c
+}
+
 ;
 ; mul
 ;
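For contrast with the new tests above, a minimal IR sketch of a case the new ISD::SUB handling deliberately skips (the function and value names are illustrative only, not part of the patch): only the left-hand operand of the sub is freely truncatable, so the conservative both-sides requirement fails and the truncate is left in place for combineSubToSubus to consider.

define <16 x i8> @sub_only_lhs_free(<16 x i8> %x, <16 x i16> %y) {
  ; %a is a free truncation source (zext from i8), but %y is an arbitrary
  ; i16 vector, so IsFreeTruncation(Op1) fails and the combine does not fire.
  %a = zext <16 x i8> %x to <16 x i16>
  %s = sub <16 x i16> %a, %y
  %t = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %t
}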