Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -661,7 +661,7 @@ EVT PromotedType = Op1Promoted.getValueType(); unsigned NewBits = PromotedType.getScalarSizeInBits(); - if (TLI.isOperationLegalOrCustom(Opcode, PromotedType)) { + auto GenerateShiftedPromotion = [&]() { unsigned ShiftOp; switch (Opcode) { case ISD::SADDSAT: @@ -688,32 +688,52 @@ SDValue Result = DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount); - } else { - if (Opcode == ISD::USUBSAT) { - SDValue Max = - DAG.getNode(ISD::UMAX, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ISD::SUB, dl, PromotedType, Max, Op2Promoted); - } + }; - if (Opcode == ISD::UADDSAT) { - APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits); - SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); - SDValue Add = - DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted); - return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax); - } + if (Opcode == ISD::UADDSAT) { + // Via a shift into the higher bits if the larger type has a legal + // UADDSAT and UMIN is not known to be legal. Otherwise use a standard add + // and UMIN if it is over the limit. + if (TLI.isOperationLegalOrCustom(Opcode, PromotedType) && + !TLI.isOperationLegal(ISD::UMIN, PromotedType)) + return GenerateShiftedPromotion(); - unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB; - APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits); - APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits); - SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType); + APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits); SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); - SDValue Result = - DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted); - Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax); - Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin); - return Result; + SDValue Add = + DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted); + return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax); } + + if (Opcode == ISD::USUBSAT) { + // If the USUBSAT is legal in the higher type we can just use it + // (zero is zero, after all). + if (TLI.isOperationLegalOrCustom(Opcode, PromotedType)) + return DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted); + + SDValue Max = + DAG.getNode(ISD::UMAX, dl, PromotedType, Op1Promoted, Op2Promoted); + return DAG.getNode(ISD::SUB, dl, PromotedType, Max, Op2Promoted); + } + + // SADDSAT or SSUBSAT. Either shifted to the higher bits if the sat intrinsic + // is legal and min/max are not known to be legal. Else expanded to a min/max + // clamp using normal add/sub. + if (TLI.isOperationLegalOrCustom(Opcode, PromotedType) && + (!TLI.isOperationLegal(ISD::SMIN, PromotedType) || + !TLI.isOperationLegal(ISD::SMAX, PromotedType))) + return GenerateShiftedPromotion(); + + unsigned AddOp = Opcode == ISD::SADDSAT ? 
ISD::ADD : ISD::SUB; + APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits); + APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits); + SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType); + SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType); + SDValue Result = + DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted); + Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax); + Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin); + return Result; } SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) { Index: llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -14,10 +14,11 @@ define arm_aapcs_vfpcc <16 x i4> @sadd_int4_t(<16 x i4> %src1, <16 x i4> %src2) { ; CHECK-LABEL: sadd_int4_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vshl.i8 q1, q1, #4 -; CHECK-NEXT: vshl.i8 q0, q0, #4 -; CHECK-NEXT: vqadd.s8 q0, q0, q1 -; CHECK-NEXT: vshr.s8 q0, q0, #4 +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vmov.i8 q1, #0x7 +; CHECK-NEXT: vmin.s8 q0, q0, q1 +; CHECK-NEXT: vmov.i8 q1, #0xf8 +; CHECK-NEXT: vmax.s8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %src1, <16 x i4> %src2) @@ -137,10 +138,9 @@ define arm_aapcs_vfpcc <16 x i4> @uadd_int4_t(<16 x i4> %src1, <16 x i4> %src2) { ; CHECK-LABEL: uadd_int4_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vshl.i8 q1, q1, #4 -; CHECK-NEXT: vshl.i8 q0, q0, #4 -; CHECK-NEXT: vqadd.u8 q0, q0, q1 -; CHECK-NEXT: vshr.u8 q0, q0, #4 +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vmov.i8 q1, #0xf +; CHECK-NEXT: vmin.u8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %src1, <16 x i4> %src2) @@ -227,10 +227,11 @@ define arm_aapcs_vfpcc <16 x i4> @ssub_int4_t(<16 x i4> %src1, <16 x i4> %src2) { ; CHECK-LABEL: ssub_int4_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vshl.i8 q1, q1, #4 -; CHECK-NEXT: vshl.i8 q0, q0, #4 -; CHECK-NEXT: vqsub.s8 q0, q0, q1 -; CHECK-NEXT: vshr.s8 q0, q0, #4 +; CHECK-NEXT: vsub.i8 q0, q0, q1 +; CHECK-NEXT: vmov.i8 q1, #0x7 +; CHECK-NEXT: vmin.s8 q0, q0, q1 +; CHECK-NEXT: vmov.i8 q1, #0xf8 +; CHECK-NEXT: vmax.s8 q0, q0, q1 ; CHECK-NEXT: bx lr entry: %0 = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %src1, <16 x i4> %src2) @@ -365,10 +366,7 @@ define arm_aapcs_vfpcc <16 x i4> @usub_int4_t(<16 x i4> %src1, <16 x i4> %src2) { ; CHECK-LABEL: usub_int4_t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vshl.i8 q1, q1, #4 -; CHECK-NEXT: vshl.i8 q0, q0, #4 ; CHECK-NEXT: vqsub.u8 q0, q0, q1 -; CHECK-NEXT: vshr.u8 q0, q0, #4 ; CHECK-NEXT: bx lr entry: %0 = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %src1, <16 x i4> %src2) Index: llvm/test/CodeGen/X86/sadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -480,75 +480,105 @@ ; Promotion define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { -; SSE-LABEL: v16i4: -; SSE: # %bb.0: -; SSE-NEXT: psllw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $4, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: paddsb %xmm1, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] 
-; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: v16i4: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddsb %xmm1, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i4: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllw $4, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: psllw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: paddsb %xmm1, %xmm0 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i4: +; SSE41: # %bb.0: +; SSE41-NEXT: paddb %xmm1, %xmm0 +; SSE41-NEXT: pminsb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmaxsb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmaxsb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { -; SSE-LABEL: v16i1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: paddsb %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: v16i1: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $7, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddsb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i1: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllw $7, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: paddsb %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i1: +; SSE41: # %bb.0: +; SSE41-NEXT: paddb %xmm1, %xmm0 +; 
SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pminsb %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i1: Index: llvm/test/CodeGen/X86/ssub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -480,75 +480,105 @@ ; Promotion define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { -; SSE-LABEL: v16i4: -; SSE: # %bb.0: -; SSE-NEXT: psllw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $4, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: psubsb %xmm1, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: v16i4: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psubsb %xmm1, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i4: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllw $4, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: psllw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psubsb %xmm1, %xmm0 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i4: +; SSE41: # %bb.0: +; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: pminsb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmaxsb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa 
{{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmaxsb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { -; SSE-LABEL: v16i1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: psubsb %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: v16i1: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $7, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psubsb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i1: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllw $7, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psubsb %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i1: +; SSE41: # %bb.0: +; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pminsb %xmm1, %xmm0 +; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i1: Index: llvm/test/CodeGen/X86/uadd_sat_vec.ll 
=================================================================== --- llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -462,26 +462,14 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-LABEL: v16i4: ; SSE: # %bb.0: -; SSE-NEXT: psllw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $4, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: paddusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pminub {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminub {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -490,38 +478,20 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; SSE-LABEL: v16i1: ; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: paddusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $7, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pminub {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminub {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i1: Index: llvm/test/CodeGen/X86/usub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/usub_sat_vec.ll +++ llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -462,26 +462,12 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-LABEL: v16i4: ; SSE: # %bb.0: -; SSE-NEXT: psllw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $4, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: psubusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pand 
{{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -490,38 +476,17 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; SSE-LABEL: v16i1: ; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: psubusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $7, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i1:
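

For reference, and not part of the patch itself: the new CHECK lines above all exercise the same promoted lowering, a plain add/sub in the wider type followed by a clamp back to the narrow type's range. This is only correct because the promoted operands arrive sign- or zero-extended, so the wide add/sub cannot wrap before the clamp. Below is a minimal standalone C++ sketch of that equivalence for the i4 cases in the tests; the helper names (sadd_sat_i4_reference and friends) are illustrative only and are not LLVM APIs.

// Standalone sketch: verify that the promoted add + SMIN/SMAX (resp. UMIN)
// clamp matches true i4 saturating add for all inputs.
#include <algorithm>
#include <cassert>
#include <cstdint>

// Exact i4 signed saturating add: clamp the true sum to [-8, 7].
static int8_t sadd_sat_i4_reference(int a, int b) {
  return (int8_t)std::clamp(a + b, -8, 7);
}

// What the promoted lowering computes: operands arrive sign-extended into i8,
// the i8 add cannot overflow for i4 inputs, then SMIN/SMAX clamp against the
// i4 limits (the 0x7 / 0xf8 constants seen in the MVE and SSE41 checks).
static int8_t sadd_sat_i4_promoted(int8_t a, int8_t b) {
  int8_t sum = (int8_t)(a + b);
  sum = std::min<int8_t>(sum, 7);
  sum = std::max<int8_t>(sum, -8);
  return sum;
}

// Unsigned variant: zero-extended operands, plain add, then a single UMIN
// against 0xf (the pminub / vmin.u8 constant in the uadd tests).
static uint8_t uadd_sat_i4_promoted(uint8_t a, uint8_t b) {
  return std::min<uint8_t>((uint8_t)(a + b), 15);
}

int main() {
  for (int a = -8; a <= 7; ++a)
    for (int b = -8; b <= 7; ++b)
      assert(sadd_sat_i4_promoted((int8_t)a, (int8_t)b) ==
             sadd_sat_i4_reference(a, b));
  for (int a = 0; a <= 15; ++a)
    for (int b = 0; b <= 15; ++b)
      assert(uadd_sat_i4_promoted((uint8_t)a, (uint8_t)b) ==
             std::min(a + b, 15));
  return 0;
}

The design point the patch encodes is that when min/max are already legal in the promoted type, this clamp sequence is shorter than shifting the operands into the high bits, using the wide saturating op, and shifting back down; the shifted form (GenerateShiftedPromotion) is kept only for the case where the saturating op is legal in the promoted type but the corresponding min/max are not.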