Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -670,7 +670,7 @@
   EVT PromotedType = Op1Promoted.getValueType();
   unsigned NewBits = PromotedType.getScalarSizeInBits();
 
-  if (TLI.isOperationLegalOrCustom(Opcode, PromotedType)) {
+  auto GenerateShiftedPromotion = [&]() {
     unsigned ShiftOp;
     switch (Opcode) {
     case ISD::SADDSAT:
@@ -697,32 +697,52 @@
     SDValue Result =
         DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
     return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
-  } else {
-    if (Opcode == ISD::USUBSAT) {
-      SDValue Max =
-          DAG.getNode(ISD::UMAX, dl, PromotedType, Op1Promoted, Op2Promoted);
-      return DAG.getNode(ISD::SUB, dl, PromotedType, Max, Op2Promoted);
-    }
+  };
 
-    if (Opcode == ISD::UADDSAT) {
-      APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
-      SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
-      SDValue Add =
-          DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
-      return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
-    }
+  if (Opcode == ISD::UADDSAT) {
+    // Use a shift into the higher bits if UADDSAT is legal or custom in the
+    // promoted type and UMIN is not known to be legal. Otherwise use a plain
+    // ADD and clamp the result with UMIN against the saturation limit.
+    if (TLI.isOperationLegalOrCustom(Opcode, PromotedType) &&
+        !TLI.isOperationLegal(ISD::UMIN, PromotedType))
+      return GenerateShiftedPromotion();
 
-    unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB;
-    APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits);
-    APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits);
-    SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType);
+    APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
     SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
-    SDValue Result =
-        DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted);
-    Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax);
-    Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin);
-    return Result;
+    SDValue Add =
+        DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
+    return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
   }
+
+  if (Opcode == ISD::USUBSAT) {
+    // If USUBSAT is legal or custom in the promoted type we can use it
+    // directly, since the saturation point (zero) is the same in both types.
+    if (TLI.isOperationLegalOrCustom(Opcode, PromotedType))
+      return DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
+
+    SDValue Max =
+        DAG.getNode(ISD::UMAX, dl, PromotedType, Op1Promoted, Op2Promoted);
+    return DAG.getNode(ISD::SUB, dl, PromotedType, Max, Op2Promoted);
+  }
+
+  // SADDSAT or SSUBSAT: use a shift into the higher bits if the saturating
+  // operation is legal or custom in the promoted type and SMIN/SMAX are not
+  // known to be legal. Otherwise expand to ADD/SUB plus an SMIN/SMAX clamp.
+  if (TLI.isOperationLegalOrCustom(Opcode, PromotedType) &&
+      (!TLI.isOperationLegal(ISD::SMIN, PromotedType) ||
+       !TLI.isOperationLegal(ISD::SMAX, PromotedType)))
+    return GenerateShiftedPromotion();
+
+  unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB;
+  APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits);
+  APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits);
+  SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType);
+  SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
+  SDValue Result =
+      DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted);
+  Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax);
+  Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin);
+  return Result;
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) {
Index: llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -128,10 +128,11 @@
 ; CHECK-NEXT: mov v1.h[2], w9
 ; CHECK-NEXT: mov v0.h[3], w10
 ; CHECK-NEXT: mov v1.h[3], w11
-; CHECK-NEXT: shl v1.4h, v1.4h, #8
-; CHECK-NEXT: shl v0.4h, v0.4h, #8
-; CHECK-NEXT: sqadd v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: sshr v0.4h, v0.4h, #8
+; CHECK-NEXT: add v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: movi v1.4h, #127
+; CHECK-NEXT: smin v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: mvni v1.4h, #127
+; CHECK-NEXT: smax v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT: xtn v0.8b, v0.8h
 ; CHECK-NEXT: str s0, [x2]
 ; CHECK-NEXT: ret
@@ -145,18 +146,19 @@
 define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind {
 ; CHECK-LABEL: v2i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: ldrb w9, [x1]
-; CHECK-NEXT: ldrb w10, [x0, #1]
-; CHECK-NEXT: ldrb w11, [x1, #1]
+; CHECK-NEXT: ldrsb w8, [x0]
+; CHECK-NEXT: ldrsb w9, [x1]
+; CHECK-NEXT: ldrsb w10, [x0, #1]
+; CHECK-NEXT: ldrsb w11, [x1, #1]
 ; CHECK-NEXT: fmov s0, w8
 ; CHECK-NEXT: fmov s1, w9
 ; CHECK-NEXT: mov v0.s[1], w10
 ; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: shl v1.2s, v1.2s, #24
-; CHECK-NEXT: shl v0.2s, v0.2s, #24
-; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #24
+; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: movi v1.2s, #127
+; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: mvni v1.2s, #127
+; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: mov w8, v0.s[1]
 ; CHECK-NEXT: fmov w9, s0
 ; CHECK-NEXT: strb w8, [x2, #1]
@@ -187,18 +189,19 @@
 define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind {
 ; CHECK-LABEL: v2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ldrh w8, [x0]
-; CHECK-NEXT: ldrh w9, [x1]
-; CHECK-NEXT: ldrh w10, [x0, #2]
-; CHECK-NEXT: ldrh w11, [x1, #2]
+; CHECK-NEXT: ldrsh w8, [x0]
+; CHECK-NEXT: ldrsh w9, [x1]
+; CHECK-NEXT: ldrsh w10, [x0, #2]
+; CHECK-NEXT: ldrsh w11, [x1, #2]
 ; CHECK-NEXT: fmov s0, w8
 ; CHECK-NEXT: fmov s1, w9
 ; CHECK-NEXT: mov v0.s[1], w10
 ; CHECK-NEXT: mov v1.s[1], w11
-; CHECK-NEXT: shl v1.2s, v1.2s, #16
-; CHECK-NEXT: shl v0.2s, v0.2s, #16
-; CHECK-NEXT: sqadd v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: ushr v0.2s, v0.2s, #16
+; CHECK-NEXT: add v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: movi v1.2s, #127, msl #8
+; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s
+; CHECK-NEXT: mvni v1.2s, #127, msl #8
+; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s
 ; CHECK-NEXT: mov w8, v0.s[1]
 ; CHECK-NEXT: fmov w9, s0
 ; CHECK-NEXT: strh w8, [x2, #2]
@@ -273,11 +276,11 @@
 ; CHECK-NEXT: shl v0.16b, v0.16b, #4
 ; CHECK-NEXT: shl v1.16b, v1.16b, #4
 ; CHECK-NEXT: sshr v0.16b, v0.16b, #4
-; CHECK-NEXT: sshr v1.16b, v1.16b, #4
-; CHECK-NEXT: shl v1.16b, v1.16b, #4
-; CHECK-NEXT: shl v0.16b, v0.16b, #4
-; CHECK-NEXT: sqadd v0.16b, v0.16b, v1.16b
-; CHECK-NEXT: sshr v0.16b, v0.16b, #4
+; CHECK-NEXT: movi v2.16b, #7
+; CHECK-NEXT: ssra v0.16b, v1.16b, #4 +; CHECK-NEXT: smin v0.16b, v0.16b, v2.16b +; CHECK-NEXT: movi v1.16b, #248 +; CHECK-NEXT: smax v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -289,11 +292,11 @@ ; CHECK-NEXT: shl v0.16b, v0.16b, #7 ; CHECK-NEXT: shl v1.16b, v1.16b, #7 ; CHECK-NEXT: sshr v0.16b, v0.16b, #7 -; CHECK-NEXT: sshr v1.16b, v1.16b, #7 -; CHECK-NEXT: shl v1.16b, v1.16b, #7 -; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: sqadd v0.16b, v0.16b, v1.16b -; CHECK-NEXT: sshr v0.16b, v0.16b, #7 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: ssra v0.16b, v1.16b, #7 +; CHECK-NEXT: smin v0.16b, v0.16b, v2.16b +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: smax v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i1> @llvm.sadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z Index: llvm/test/CodeGen/AArch64/ssub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/AArch64/ssub_sat_vec.ll +++ llvm/test/CodeGen/AArch64/ssub_sat_vec.ll @@ -129,10 +129,11 @@ ; CHECK-NEXT: mov v1.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w10 ; CHECK-NEXT: mov v1.h[3], w11 -; CHECK-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-NEXT: shl v0.4h, v0.4h, #8 -; CHECK-NEXT: sqsub v0.4h, v0.4h, v1.4h -; CHECK-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-NEXT: sub v0.4h, v0.4h, v1.4h +; CHECK-NEXT: movi v1.4h, #127 +; CHECK-NEXT: smin v0.4h, v0.4h, v1.4h +; CHECK-NEXT: mvni v1.4h, #127 +; CHECK-NEXT: smax v0.4h, v0.4h, v1.4h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret @@ -146,18 +147,19 @@ define void @v2i8(<2 x i8>* %px, <2 x i8>* %py, <2 x i8>* %pz) nounwind { ; CHECK-LABEL: v2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrb w8, [x0] -; CHECK-NEXT: ldrb w9, [x1] -; CHECK-NEXT: ldrb w10, [x0, #1] -; CHECK-NEXT: ldrb w11, [x1, #1] +; CHECK-NEXT: ldrsb w8, [x0] +; CHECK-NEXT: ldrsb w9, [x1] +; CHECK-NEXT: ldrsb w10, [x0, #1] +; CHECK-NEXT: ldrsb w11, [x1, #1] ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #24 -; CHECK-NEXT: shl v0.2s, v0.2s, #24 -; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ushr v0.2s, v0.2s, #24 +; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: movi v1.2s, #127 +; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s +; CHECK-NEXT: mvni v1.2s, #127 +; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strb w8, [x2, #1] @@ -188,18 +190,19 @@ define void @v2i16(<2 x i16>* %px, <2 x i16>* %py, <2 x i16>* %pz) nounwind { ; CHECK-LABEL: v2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldrh w8, [x0] -; CHECK-NEXT: ldrh w9, [x1] -; CHECK-NEXT: ldrh w10, [x0, #2] -; CHECK-NEXT: ldrh w11, [x1, #2] +; CHECK-NEXT: ldrsh w8, [x0] +; CHECK-NEXT: ldrsh w9, [x1] +; CHECK-NEXT: ldrsh w10, [x0, #2] +; CHECK-NEXT: ldrsh w11, [x1, #2] ; CHECK-NEXT: fmov s0, w8 ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: sqsub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ushr v0.2s, v0.2s, #16 +; CHECK-NEXT: sub v0.2s, v0.2s, v1.2s +; CHECK-NEXT: movi v1.2s, #127, msl #8 +; CHECK-NEXT: smin v0.2s, v0.2s, v1.2s +; CHECK-NEXT: mvni v1.2s, #127, msl #8 +; CHECK-NEXT: smax v0.2s, v0.2s, v1.2s ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w8, [x2, #2] @@ -271,14 +274,15 @@ 
define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v0.16b, v0.16b, #4 -; CHECK-NEXT: shl v1.16b, v1.16b, #4 -; CHECK-NEXT: sshr v0.16b, v0.16b, #4 -; CHECK-NEXT: sshr v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v1.16b, v1.16b, #4 ; CHECK-NEXT: shl v0.16b, v0.16b, #4 -; CHECK-NEXT: sqsub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: sshr v1.16b, v1.16b, #4 ; CHECK-NEXT: sshr v0.16b, v0.16b, #4 +; CHECK-NEXT: movi v2.16b, #7 +; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: smin v0.16b, v0.16b, v2.16b +; CHECK-NEXT: movi v1.16b, #248 +; CHECK-NEXT: smax v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -287,14 +291,15 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; CHECK-LABEL: v16i1: ; CHECK: // %bb.0: -; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: shl v1.16b, v1.16b, #7 -; CHECK-NEXT: sshr v0.16b, v0.16b, #7 -; CHECK-NEXT: sshr v1.16b, v1.16b, #7 ; CHECK-NEXT: shl v1.16b, v1.16b, #7 ; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: sqsub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: sshr v1.16b, v1.16b, #7 ; CHECK-NEXT: sshr v0.16b, v0.16b, #7 +; CHECK-NEXT: movi v2.2d, #0000000000000000 +; CHECK-NEXT: sub v0.16b, v0.16b, v1.16b +; CHECK-NEXT: smin v0.16b, v0.16b, v2.16b +; CHECK-NEXT: movi v1.2d, #0xffffffffffffffff +; CHECK-NEXT: smax v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %z = call <16 x i1> @llvm.ssub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z Index: llvm/test/CodeGen/AArch64/uadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/AArch64/uadd_sat_vec.ll +++ llvm/test/CodeGen/AArch64/uadd_sat_vec.ll @@ -116,22 +116,21 @@ ; CHECK-NEXT: ldrb w9, [x1] ; CHECK-NEXT: ldrb w10, [x0, #1] ; CHECK-NEXT: ldrb w11, [x1, #1] +; CHECK-NEXT: ldrb w12, [x0, #2] ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldrb w8, [x1, #2] ; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: ldrb w8, [x0, #2] -; CHECK-NEXT: ldrb w9, [x1, #2] ; CHECK-NEXT: mov v0.h[1], w10 +; CHECK-NEXT: ldrb w9, [x0, #3] +; CHECK-NEXT: ldrb w10, [x1, #3] ; CHECK-NEXT: mov v1.h[1], w11 -; CHECK-NEXT: ldrb w10, [x0, #3] -; CHECK-NEXT: ldrb w11, [x1, #3] -; CHECK-NEXT: mov v0.h[2], w8 -; CHECK-NEXT: mov v1.h[2], w9 -; CHECK-NEXT: mov v0.h[3], w10 -; CHECK-NEXT: mov v1.h[3], w11 -; CHECK-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-NEXT: shl v0.4h, v0.4h, #8 -; CHECK-NEXT: uqadd v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ushr v0.4h, v0.4h, #8 +; CHECK-NEXT: mov v0.h[2], w12 +; CHECK-NEXT: mov v1.h[2], w8 +; CHECK-NEXT: mov v0.h[3], w9 +; CHECK-NEXT: mov v1.h[3], w10 +; CHECK-NEXT: movi d2, #0xff00ff00ff00ff +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: umin v0.4h, v0.4h, v2.4h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret @@ -150,13 +149,12 @@ ; CHECK-NEXT: ldrb w10, [x0, #1] ; CHECK-NEXT: ldrb w11, [x1, #1] ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #24 -; CHECK-NEXT: shl v0.2s, v0.2s, #24 -; CHECK-NEXT: uqadd v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ushr v0.2s, v0.2s, #24 +; CHECK-NEXT: mov v2.s[1], w11 +; CHECK-NEXT: movi d1, #0x0000ff000000ff +; CHECK-NEXT: add v0.2s, v0.2s, v2.2s +; CHECK-NEXT: umin v0.2s, v0.2s, v1.2s ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strb w8, [x2, #1] @@ -192,13 +190,12 @@ ; CHECK-NEXT: ldrh w10, [x0, #2] ; CHECK-NEXT: ldrh 
w11, [x1, #2] ; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: mov v0.s[1], w10 -; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 -; CHECK-NEXT: uqadd v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ushr v0.2s, v0.2s, #16 +; CHECK-NEXT: mov v2.s[1], w11 +; CHECK-NEXT: movi d1, #0x00ffff0000ffff +; CHECK-NEXT: add v0.2s, v0.2s, v2.2s +; CHECK-NEXT: umin v0.2s, v0.2s, v1.2s ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w8, [x2, #2] @@ -271,12 +268,10 @@ ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #15 -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: shl v1.16b, v1.16b, #4 -; CHECK-NEXT: shl v0.16b, v0.16b, #4 -; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushr v0.16b, v0.16b, #4 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umin v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -286,12 +281,10 @@ ; CHECK-LABEL: v16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: shl v1.16b, v1.16b, #7 -; CHECK-NEXT: shl v0.16b, v0.16b, #7 -; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushr v0.16b, v0.16b, #7 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: add v0.16b, v0.16b, v1.16b +; CHECK-NEXT: umin v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %z = call <16 x i1> @llvm.uadd.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z Index: llvm/test/CodeGen/AArch64/usub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/AArch64/usub_sat_vec.ll +++ llvm/test/CodeGen/AArch64/usub_sat_vec.ll @@ -129,10 +129,7 @@ ; CHECK-NEXT: mov v1.h[2], w9 ; CHECK-NEXT: mov v0.h[3], w10 ; CHECK-NEXT: mov v1.h[3], w11 -; CHECK-NEXT: shl v1.4h, v1.4h, #8 -; CHECK-NEXT: shl v0.4h, v0.4h, #8 ; CHECK-NEXT: uqsub v0.4h, v0.4h, v1.4h -; CHECK-NEXT: ushr v0.4h, v0.4h, #8 ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: str s0, [x2] ; CHECK-NEXT: ret @@ -154,10 +151,7 @@ ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #24 -; CHECK-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ushr v0.2s, v0.2s, #24 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strb w8, [x2, #1] @@ -196,10 +190,7 @@ ; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: mov v0.s[1], w10 ; CHECK-NEXT: mov v1.s[1], w11 -; CHECK-NEXT: shl v1.2s, v1.2s, #16 -; CHECK-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-NEXT: uqsub v0.2s, v0.2s, v1.2s -; CHECK-NEXT: ushr v0.2s, v0.2s, #16 ; CHECK-NEXT: mov w8, v0.s[1] ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: strh w8, [x2, #2] @@ -272,12 +263,9 @@ ; CHECK-LABEL: v16i4: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #15 -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: shl v1.16b, v1.16b, #4 -; CHECK-NEXT: shl v0.16b, v0.16b, #4 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushr v0.16b, v0.16b, #4 ; CHECK-NEXT: ret %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -287,12 +275,9 @@ ; CHECK-LABEL: v16i1: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v2.16b, #1 -; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; 
CHECK-NEXT: and v1.16b, v1.16b, v2.16b -; CHECK-NEXT: shl v1.16b, v1.16b, #7 -; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: uqsub v0.16b, v0.16b, v1.16b -; CHECK-NEXT: ushr v0.16b, v0.16b, #7 ; CHECK-NEXT: ret %z = call <16 x i1> @llvm.usub.sat.v16i1(<16 x i1> %x, <16 x i1> %y) ret <16 x i1> %z Index: llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll +++ llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll @@ -8,10 +8,11 @@ ; CHECK-NEXT: vshl.i8 q0, q0, #4 ; CHECK-NEXT: vshr.s8 q1, q1, #4 ; CHECK-NEXT: vshr.s8 q0, q0, #4 -; CHECK-NEXT: vshl.i8 q1, q1, #4 -; CHECK-NEXT: vshl.i8 q0, q0, #4 -; CHECK-NEXT: vqadd.s8 q0, q0, q1 -; CHECK-NEXT: vshr.s8 q0, q0, #4 +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vmov.i8 q1, #0x7 +; CHECK-NEXT: vmin.s8 q0, q0, q1 +; CHECK-NEXT: vmov.i8 q1, #0xf8 +; CHECK-NEXT: vmax.s8 q0, q0, q1 ; CHECK-NEXT: bx lr enrty: %0 = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %src1, <16 x i4> %src2) @@ -33,10 +34,11 @@ ; CHECK: @ %bb.0: @ %enrty ; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vshl.i16 q1, q1, #8 -; CHECK-NEXT: vshl.i16 q0, q0, #8 -; CHECK-NEXT: vqadd.s16 q0, q0, q1 -; CHECK-NEXT: vshr.s16 q0, q0, #8 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vmov.i16 q1, #0x7f +; CHECK-NEXT: vmin.s16 q0, q0, q1 +; CHECK-NEXT: vmvn.i16 q1, #0x7f +; CHECK-NEXT: vmax.s16 q0, q0, q1 ; CHECK-NEXT: bx lr enrty: %0 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> %src1, <8 x i8> %src2) @@ -46,10 +48,13 @@ define arm_aapcs_vfpcc <4 x i16> @sadd_v4i16(<4 x i16> %src1, <4 x i16> %src2) { ; CHECK-LABEL: sadd_v4i16: ; CHECK: @ %bb.0: @ %enrty -; CHECK-NEXT: vshl.i32 q1, q1, #16 -; CHECK-NEXT: vshl.i32 q0, q0, #16 -; CHECK-NEXT: vqadd.s32 q0, q0, q1 -; CHECK-NEXT: vshr.s32 q0, q0, #16 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: vmov.i32 q1, #0x7fff +; CHECK-NEXT: vmin.s32 q0, q0, q1 +; CHECK-NEXT: vmvn.i32 q1, #0x7fff +; CHECK-NEXT: vmax.s32 q0, q0, q1 ; CHECK-NEXT: bx lr enrty: %0 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> %src1, <4 x i16> %src2) @@ -254,10 +259,8 @@ ; CHECK-NEXT: vmov.i8 q2, #0xf ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vshl.i8 q1, q1, #4 -; CHECK-NEXT: vshl.i8 q0, q0, #4 -; CHECK-NEXT: vqadd.u8 q0, q0, q1 -; CHECK-NEXT: vshr.u8 q0, q0, #4 +; CHECK-NEXT: vadd.i8 q0, q0, q1 +; CHECK-NEXT: vmin.u8 q0, q0, q2 ; CHECK-NEXT: bx lr enrty: %0 = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %src1, <16 x i4> %src2) @@ -279,10 +282,9 @@ ; CHECK: @ %bb.0: @ %enrty ; CHECK-NEXT: vmovlb.u8 q1, q1 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vshl.i16 q1, q1, #8 -; CHECK-NEXT: vshl.i16 q0, q0, #8 -; CHECK-NEXT: vqadd.u16 q0, q0, q1 -; CHECK-NEXT: vshr.u16 q0, q0, #8 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vmov.i16 q1, #0xff +; CHECK-NEXT: vmin.u16 q0, q0, q1 ; CHECK-NEXT: bx lr enrty: %0 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> %src1, <8 x i8> %src2) @@ -292,10 +294,11 @@ define arm_aapcs_vfpcc <4 x i16> @uadd_v4i16(<4 x i16> %src1, <4 x i16> %src2) { ; CHECK-LABEL: uadd_v4i16: ; CHECK: @ %bb.0: @ %enrty -; CHECK-NEXT: vshl.i32 q1, q1, #16 -; CHECK-NEXT: vshl.i32 q0, q0, #16 -; CHECK-NEXT: vqadd.u32 q0, q0, q1 -; CHECK-NEXT: vshr.u32 q0, q0, #16 +; CHECK-NEXT: vmovlb.u16 q1, q1 +; CHECK-NEXT: vmovlb.u16 q0, q0 +; CHECK-NEXT: vadd.i32 q0, q0, q1 +; CHECK-NEXT: 
vmov.i32 q1, #0xffff +; CHECK-NEXT: vmin.u32 q0, q0, q1 ; CHECK-NEXT: bx lr enrty: %0 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> %src1, <4 x i16> %src2) @@ -437,10 +440,11 @@ ; CHECK-NEXT: vshl.i8 q0, q0, #4 ; CHECK-NEXT: vshr.s8 q1, q1, #4 ; CHECK-NEXT: vshr.s8 q0, q0, #4 -; CHECK-NEXT: vshl.i8 q1, q1, #4 -; CHECK-NEXT: vshl.i8 q0, q0, #4 -; CHECK-NEXT: vqsub.s8 q0, q0, q1 -; CHECK-NEXT: vshr.s8 q0, q0, #4 +; CHECK-NEXT: vsub.i8 q0, q0, q1 +; CHECK-NEXT: vmov.i8 q1, #0x7 +; CHECK-NEXT: vmin.s8 q0, q0, q1 +; CHECK-NEXT: vmov.i8 q1, #0xf8 +; CHECK-NEXT: vmax.s8 q0, q0, q1 ; CHECK-NEXT: bx lr enrty: %0 = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %src1, <16 x i4> %src2) @@ -462,10 +466,11 @@ ; CHECK: @ %bb.0: @ %enrty ; CHECK-NEXT: vmovlb.s8 q1, q1 ; CHECK-NEXT: vmovlb.s8 q0, q0 -; CHECK-NEXT: vshl.i16 q1, q1, #8 -; CHECK-NEXT: vshl.i16 q0, q0, #8 -; CHECK-NEXT: vqsub.s16 q0, q0, q1 -; CHECK-NEXT: vshr.s16 q0, q0, #8 +; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vmov.i16 q1, #0x7f +; CHECK-NEXT: vmin.s16 q0, q0, q1 +; CHECK-NEXT: vmvn.i16 q1, #0x7f +; CHECK-NEXT: vmax.s16 q0, q0, q1 ; CHECK-NEXT: bx lr enrty: %0 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> %src1, <8 x i8> %src2) @@ -475,10 +480,13 @@ define arm_aapcs_vfpcc <4 x i16> @ssub_v4i16(<4 x i16> %src1, <4 x i16> %src2) { ; CHECK-LABEL: ssub_v4i16: ; CHECK: @ %bb.0: @ %enrty -; CHECK-NEXT: vshl.i32 q1, q1, #16 -; CHECK-NEXT: vshl.i32 q0, q0, #16 -; CHECK-NEXT: vqsub.s32 q0, q0, q1 -; CHECK-NEXT: vshr.s32 q0, q0, #16 +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vsub.i32 q0, q0, q1 +; CHECK-NEXT: vmov.i32 q1, #0x7fff +; CHECK-NEXT: vmin.s32 q0, q0, q1 +; CHECK-NEXT: vmvn.i32 q1, #0x7fff +; CHECK-NEXT: vmax.s32 q0, q0, q1 ; CHECK-NEXT: bx lr enrty: %0 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> %src1, <4 x i16> %src2) @@ -698,10 +706,7 @@ ; CHECK-NEXT: vmov.i8 q2, #0xf ; CHECK-NEXT: vand q1, q1, q2 ; CHECK-NEXT: vand q0, q0, q2 -; CHECK-NEXT: vshl.i8 q1, q1, #4 -; CHECK-NEXT: vshl.i8 q0, q0, #4 ; CHECK-NEXT: vqsub.u8 q0, q0, q1 -; CHECK-NEXT: vshr.u8 q0, q0, #4 ; CHECK-NEXT: bx lr enrty: %0 = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %src1, <16 x i4> %src2) @@ -723,10 +728,7 @@ ; CHECK: @ %bb.0: @ %enrty ; CHECK-NEXT: vmovlb.u8 q1, q1 ; CHECK-NEXT: vmovlb.u8 q0, q0 -; CHECK-NEXT: vshl.i16 q1, q1, #8 -; CHECK-NEXT: vshl.i16 q0, q0, #8 ; CHECK-NEXT: vqsub.u16 q0, q0, q1 -; CHECK-NEXT: vshr.u16 q0, q0, #8 ; CHECK-NEXT: bx lr enrty: %0 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> %src1, <8 x i8> %src2) @@ -736,10 +738,9 @@ define arm_aapcs_vfpcc <4 x i16> @usub_v4i16(<4 x i16> %src1, <4 x i16> %src2) { ; CHECK-LABEL: usub_v4i16: ; CHECK: @ %bb.0: @ %enrty -; CHECK-NEXT: vshl.i32 q1, q1, #16 -; CHECK-NEXT: vshl.i32 q0, q0, #16 +; CHECK-NEXT: vmovlb.u16 q1, q1 +; CHECK-NEXT: vmovlb.u16 q0, q0 ; CHECK-NEXT: vqsub.u32 q0, q0, q1 -; CHECK-NEXT: vshr.u32 q0, q0, #16 ; CHECK-NEXT: bx lr enrty: %0 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %src1, <4 x i16> %src2) Index: llvm/test/CodeGen/X86/sadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -480,63 +480,134 @@ ; Promotion define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { -; SSE-LABEL: v16i4: -; SSE: # %bb.0: -; SSE-NEXT: psllw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $4, %xmm0 -; 
SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: paddsb %xmm1, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: v16i4: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddsb %xmm1, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i4: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllw $4, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: psllw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: paddsb %xmm1, %xmm0 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i4: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: pand %xmm1, %xmm2 +; SSE41-NEXT: psrlw $4, %xmm2 +; SSE41-NEXT: pxor %xmm3, %xmm2 +; SSE41-NEXT: paddb %xmm2, %xmm0 +; SSE41-NEXT: psubb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pminsb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmaxsb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpminsb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmaxsb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { -; SSE-LABEL: v16i1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: 
paddsb %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: v16i1: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $7, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: paddsb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i1: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllw $7, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: paddsb %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i1: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $7, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm4 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE41-NEXT: paddb %xmm4, %xmm1 +; SSE41-NEXT: pminsb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminsb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: @@ -544,11 +615,15 @@ ; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i1: Index: llvm/test/CodeGen/X86/ssub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -476,63 +476,132 @@ ; Promotion define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { -; SSE-LABEL: v16i4: -; SSE: # %bb.0: -; SSE-NEXT: psllw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $4, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: psubsb %xmm1, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: v16i4: +; SSE2: # %bb.0: +; SSE2-NEXT: psllw $4, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: psllw $4, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psubsb %xmm1, %xmm0 +; SSE2-NEXT: psrlw $4, %xmm0 +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: psubb %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i4: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllw $4, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: psllw $4, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psubsb %xmm1, %xmm0 +; SSSE3-NEXT: psrlw $4, %xmm0 +; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 +; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSSE3-NEXT: pxor %xmm1, %xmm0 +; SSSE3-NEXT: psubb %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i4: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $4, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: psrlw $4, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; SSE41-NEXT: pxor %xmm3, %xmm1 +; SSE41-NEXT: psllw $4, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: psrlw $4, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 +; SSE41-NEXT: psubb %xmm1, %xmm0 +; SSE41-NEXT: pminsb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: pmaxsb {{.*}}(%rip), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: ; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX-NEXT: vpxor %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpxor %xmm3, %xmm0, %xmm0 ; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminsb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpmaxsb {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { -; SSE-LABEL: v16i1: -; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $7, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: psubsb %xmm1, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: v16i1: +; SSE2: # %bb.0: +; 
SSE2-NEXT: psllw $7, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: psllw $7, %xmm0 +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: psubsb %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSSE3-LABEL: v16i1: +; SSSE3: # %bb.0: +; SSSE3-NEXT: psllw $7, %xmm1 +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSSE3-NEXT: pand %xmm2, %xmm1 +; SSSE3-NEXT: psllw $7, %xmm0 +; SSSE3-NEXT: pand %xmm2, %xmm0 +; SSSE3-NEXT: psubsb %xmm1, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtb %xmm0, %xmm1 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: retq +; +; SSE41-LABEL: v16i1: +; SSE41: # %bb.0: +; SSE41-NEXT: psllw $7, %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE41-NEXT: pand %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm4 +; SSE41-NEXT: psllw $7, %xmm0 +; SSE41-NEXT: pand %xmm2, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpgtb %xmm0, %xmm1 +; SSE41-NEXT: psubb %xmm4, %xmm1 +; SSE41-NEXT: pminsb %xmm3, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pmaxsb %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: ; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 ; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminsb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: @@ -540,11 +609,15 @@ ; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vpcmpgtb %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpcmpgtb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminsb %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i1: Index: llvm/test/CodeGen/X86/uadd_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -462,26 +462,20 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-LABEL: v16i4: ; SSE: # %bb.0: -; SSE-NEXT: psllw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: 
paddusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pminub %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.uadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -490,38 +484,29 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; SSE-LABEL: v16i1: ; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $7, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: paddusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $7, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE-NEXT: paddb %xmm1, %xmm0 +; SSE-NEXT: pminub %xmm2, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i1: Index: llvm/test/CodeGen/X86/usub_sat_vec.ll =================================================================== --- llvm/test/CodeGen/X86/usub_sat_vec.ll +++ llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -462,26 +462,18 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-LABEL: v16i4: ; SSE: # %bb.0: -; SSE-NEXT: psllw $4, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $4, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: psubusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $4, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: v16i4: ; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = 
[240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 ; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %z = call <16 x i4> @llvm.usub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z @@ -490,38 +482,26 @@ define <16 x i1> @v16i1(<16 x i1> %x, <16 x i1> %y) nounwind { ; SSE-LABEL: v16i1: ; SSE: # %bb.0: -; SSE-NEXT: psllw $7, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: psllw $7, %xmm0 ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: psubusb %xmm1, %xmm0 -; SSE-NEXT: psrlw $7, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: v16i1: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v16i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllw $7, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128] +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpsllw $7, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $7, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v16i1:
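To make the intent of the new expansions concrete, here is a minimal scalar sketch of the unsigned cases in plain C++ (not part of the patch; the i8-promoted-to-i32 widths and the helper names are illustrative assumptions). UADDSAT becomes a plain add in the promoted type clamped with UMIN against the narrow type's all-ones value, and USUBSAT becomes UMAX followed by SUB, which already saturates at zero in any wider type.

  // Scalar model of the unsigned promotions (sketch only).
  #include <algorithm>
  #include <cstdint>

  uint8_t uadd_sat_u8_via_u32(uint8_t a, uint8_t b) {
    uint32_t Sum = uint32_t(a) + uint32_t(b);       // promoted ADD, cannot wrap
    return uint8_t(std::min(Sum, uint32_t{0xFF}));  // UMIN against SatMax (all ones)
  }

  uint8_t usub_sat_u8_via_u32(uint8_t a, uint8_t b) {
    uint32_t Hi = std::max(uint32_t(a), uint32_t(b));  // UMAX
    return uint8_t(Hi - uint32_t(b));                  // a >= b ? a - b : 0
  }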
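The signed cases use the analogous clamp when SMIN/SMAX are legal in the promoted type; a sketch under the same assumptions:

  // Scalar model of the SADDSAT clamp expansion: arithmetic in the promoted
  // type, then SMIN/SMAX to the narrow type's signed range, then truncate.
  #include <algorithm>
  #include <cstdint>

  int8_t sadd_sat_i8_via_i32(int8_t a, int8_t b) {
    int32_t Wide = int32_t(a) + int32_t(b);    // promoted ADD, cannot overflow
    Wide = std::min(Wide, int32_t(INT8_MAX));  // SMIN against SatMax (127)
    Wide = std::max(Wide, int32_t(INT8_MIN));  // SMAX against SatMin (-128)
    return int8_t(Wide);                       // truncate back to the old type
  }

The shifted form produced by GenerateShiftedPromotion is equivalent: the operands are shifted into the top bits so that the wide saturating node saturates at the same points, and the result is shifted back down afterwards.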