diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9698,6 +9698,51 @@
         return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
       }
     }
+
+    // Match VSELECTs into add with unsigned saturation.
+    if (hasOperation(ISD::UADDSAT, VT)) {
+      // Check if one of the arms of the VSELECT is vector with all bits set.
+      // If it's on the left side invert the predicate to simplify logic below.
+      SDValue Other;
+      ISD::CondCode SatCC = CC;
+      if (ISD::isBuildVectorAllOnes(N1.getNode())) {
+        Other = N2;
+        SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType());
+      } else if (ISD::isBuildVectorAllOnes(N2.getNode())) {
+        Other = N1;
+      }
+
+      if (Other && Other.getOpcode() == ISD::ADD) {
+        SDValue CondLHS = LHS, CondRHS = RHS;
+        SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
+
+        // Canonicalize condition operands.
+        if (SatCC == ISD::SETUGE) {
+          std::swap(CondLHS, CondRHS);
+          SatCC = ISD::SETULE;
+        }
+
+        // We can test against either of the addition operands.
+        // x <= x+y ? x+y : ~0 --> uaddsat x, y
+        // x+y >= x ? x+y : ~0 --> uaddsat x, y
+        if (SatCC == ISD::SETULE && Other == CondRHS &&
+            (OpLHS == CondLHS || OpRHS == CondLHS))
+          return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
+
+        if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
+            CondLHS == OpLHS) {
+          // If the RHS is a constant we have to reverse the const
+          // canonicalization.
+          // x >= ~C ? x+C : ~0 --> uaddsat x, C
+          auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+            return Cond->getAPIntValue() == ~Op->getAPIntValue();
+          };
+          if (SatCC == ISD::SETULE &&
+              ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
+            return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
+        }
+      }
+    }
   }
 
   if (SimplifySelectOps(N, N1, N2))
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -7525,13 +7525,13 @@
   assert(VT.isInteger() && "Expected operands to be integers");
 
   // usub.sat(a, b) -> umax(a, b) - b
-  if (Opcode == ISD::USUBSAT && isOperationLegalOrCustom(ISD::UMAX, VT)) {
+  if (Opcode == ISD::USUBSAT && isOperationLegal(ISD::UMAX, VT)) {
     SDValue Max = DAG.getNode(ISD::UMAX, dl, VT, LHS, RHS);
     return DAG.getNode(ISD::SUB, dl, VT, Max, RHS);
   }
 
   // uadd.sat(a, b) -> umin(a, ~b) + b
-  if (Opcode == ISD::UADDSAT && isOperationLegalOrCustom(ISD::UMIN, VT)) {
+  if (Opcode == ISD::UADDSAT && isOperationLegal(ISD::UMIN, VT)) {
     SDValue InvRHS = DAG.getNOT(dl, RHS, VT);
     SDValue Min = DAG.getNode(ISD::UMIN, dl, VT, LHS, InvRHS);
     return DAG.getNode(ISD::ADD, dl, VT, Min, RHS);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -922,9 +922,7 @@
     setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal);
     setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal);
     setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal);
-    setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
     setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom);
-    setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
     setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom);
 
     setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
@@ -1103,6 +1101,8 @@
     setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
     setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
 
+    setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom);
+
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1143,6 +1143,10 @@
     }
   }
 
+  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) {
+    setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom);
+  }
+
   if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                      MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
@@ -26889,17 +26893,6 @@
   EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
                                                *DAG.getContext(), VT);
 
-  if (Opcode == ISD::UADDSAT && !TLI.isOperationLegal(ISD::UMIN, VT)) {
-    // uaddsat X, Y --> (X >u (X + Y)) ? -1 : X + Y
-    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, Y);
-    SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Add, ISD::SETUGT);
-    // TODO: Move this to DAGCombiner?
-    if (SetCCResultType == VT &&
-        DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits())
-      return DAG.getNode(ISD::OR, DL, VT, Cmp, Add);
-    return DAG.getSelect(DL, VT, Cmp, DAG.getAllOnesConstant(DL, VT), Add);
-  }
-
   if (Opcode == ISD::USUBSAT && !TLI.isOperationLegal(ISD::UMAX, VT)) {
     // usubsat X, Y --> (X >u Y) ? X - Y : 0
     SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y);
@@ -40988,59 +40981,6 @@
     }
   }
 
-  // Match VSELECTs into add with unsigned saturation.
-  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
-      // paddus is available in SSE2 for i8 and i16 vectors.
-      Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
-      isPowerOf2_32(VT.getVectorNumElements()) &&
-      (VT.getVectorElementType() == MVT::i8 ||
-       VT.getVectorElementType() == MVT::i16)) {
-    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
-    SDValue CondLHS = Cond->getOperand(0);
-    SDValue CondRHS = Cond->getOperand(1);
-
-    // Check if one of the arms of the VSELECT is vector with all bits set.
-    // If it's on the left side invert the predicate to simplify logic below.
-    SDValue Other;
-    if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
-      Other = RHS;
-      CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
-    } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
-      Other = LHS;
-    }
-
-    if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
-      SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
-
-      // Canonicalize condition operands.
-      if (CC == ISD::SETUGE) {
-        std::swap(CondLHS, CondRHS);
-        CC = ISD::SETULE;
-      }
-
-      // We can test against either of the addition operands.
-      // x <= x+y ? x+y : ~0 --> addus x, y
-      // x+y >= x ? x+y : ~0 --> addus x, y
-      if (CC == ISD::SETULE && Other == CondRHS &&
-          (OpLHS == CondLHS || OpRHS == CondLHS))
-        return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
-
-      if (isa<BuildVectorSDNode>(OpRHS) && isa<BuildVectorSDNode>(CondRHS) &&
-          CondLHS == OpLHS) {
-        // If the RHS is a constant we have to reverse the const
-        // canonicalization.
-        // x > ~C ? x+C : ~0 --> addus x, C
-        auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
-          return Cond->getAPIntValue() == ~Op->getAPIntValue();
-        };
-        if (CC == ISD::SETULE &&
-            ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
-          return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
-      }
-    }
-  }
-
   // Check if the first operand is all zeros and Cond type is vXi1.
   // If this an avx512 target we can improve the use of zero masking by
   // swapping the operands and inverting the condition.
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll --- a/llvm/test/CodeGen/AArch64/sat-add.ll +++ b/llvm/test/CodeGen/AArch64/sat-add.ll @@ -360,9 +360,7 @@ ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.16b, #42 -; CHECK-NEXT: add v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, %a @@ -374,10 +372,7 @@ ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.16b, #42 -; CHECK-NEXT: movi v2.16b, #213 -; CHECK-NEXT: add v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhi v0.16b, v0.16b, v2.16b -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, @@ -403,9 +398,7 @@ ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.8h, #42 -; CHECK-NEXT: add v1.8h, v0.8h, v1.8h -; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, %a @@ -417,10 +410,7 @@ ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.8h, #42 -; CHECK-NEXT: mvni v2.8h, #42 -; CHECK-NEXT: add v1.8h, v0.8h, v1.8h -; CHECK-NEXT: cmhi v0.8h, v0.8h, v2.8h -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, @@ -446,9 +436,7 @@ ; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %a = add <4 x i32> %x, %c = icmp ugt <4 x i32> %x, %a @@ -460,10 +448,7 @@ ; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval: ; CHECK: // %bb.0: ; CHECK-NEXT: movi v1.4s, #42 -; CHECK-NEXT: mvni v2.4s, #42 -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhi v0.4s, v0.4s, v2.4s -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %a = add <4 x i32> %x, %c = icmp ugt <4 x i32> %x, @@ -493,9 +478,7 @@ ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #42 ; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: add v1.2d, v0.2d, v1.2d -; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %a = add <2 x i64> %x, %c = icmp ugt <2 x i64> %x, %a @@ -507,12 +490,8 @@ ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #42 -; CHECK-NEXT: mov x9, #-43 ; CHECK-NEXT: dup v1.2d, x8 -; CHECK-NEXT: dup v2.2d, x9 -; CHECK-NEXT: add v1.2d, v0.2d, v1.2d -; CHECK-NEXT: cmhi v0.2d, v0.2d, v2.2d -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %a = add <2 x i64> %x, %c = icmp ugt <2 x i64> %x, @@ -537,9 +516,7 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: ; CHECK: // %bb.0: -; CHECK-NEXT: add v1.16b, v0.16b, v1.16b -; CHECK-NEXT: cmhi v0.16b, v0.16b, v1.16b -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; 
CHECK-NEXT: uqadd v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %a = add <16 x i8> %x, %y %c = icmp ugt <16 x i8> %x, %a @@ -579,9 +556,7 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: ; CHECK: // %bb.0: -; CHECK-NEXT: add v1.8h, v0.8h, v1.8h -; CHECK-NEXT: cmhi v0.8h, v0.8h, v1.8h -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %a = add <8 x i16> %x, %y %c = icmp ugt <8 x i16> %x, %a @@ -621,9 +596,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum: ; CHECK: // %bb.0: -; CHECK-NEXT: add v1.4s, v0.4s, v1.4s -; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ret %a = add <4 x i32> %x, %y %c = icmp ugt <4 x i32> %x, %a @@ -664,9 +637,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i64> %y) { ; CHECK-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; CHECK: // %bb.0: -; CHECK-NEXT: add v1.2d, v0.2d, v1.2d -; CHECK-NEXT: cmhi v0.2d, v0.2d, v1.2d -; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b +; CHECK-NEXT: uqadd v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %a = add <2 x i64> %x, %y %c = icmp ugt <2 x i64> %x, %a diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll --- a/llvm/test/CodeGen/PowerPC/sat-add.ll +++ b/llvm/test/CodeGen/PowerPC/sat-add.ll @@ -396,12 +396,9 @@ ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: ; CHECK: # %bb.0: ; CHECK-NEXT: addis 3, 2, .LCPI25_0@toc@ha -; CHECK-NEXT: xxleqv 0, 0, 0 ; CHECK-NEXT: addi 3, 3, .LCPI25_0@toc@l ; CHECK-NEXT: lvx 3, 0, 3 -; CHECK-NEXT: vaddubm 3, 2, 3 -; CHECK-NEXT: vcmpgtub 2, 2, 3 -; CHECK-NEXT: xxsel 34, 35, 0, 34 +; CHECK-NEXT: vaddubs 2, 2, 3 ; CHECK-NEXT: blr %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, %a @@ -412,16 +409,10 @@ define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_notval(<16 x i8> %x) { ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: ; CHECK: # %bb.0: -; CHECK-NEXT: addis 3, 2, .LCPI26_1@toc@ha -; CHECK-NEXT: xxleqv 0, 0, 0 -; CHECK-NEXT: addi 3, 3, .LCPI26_1@toc@l -; CHECK-NEXT: lvx 3, 0, 3 ; CHECK-NEXT: addis 3, 2, .LCPI26_0@toc@ha ; CHECK-NEXT: addi 3, 3, .LCPI26_0@toc@l -; CHECK-NEXT: vcmpgtub 3, 2, 3 -; CHECK-NEXT: lvx 4, 0, 3 -; CHECK-NEXT: vaddubm 2, 2, 4 -; CHECK-NEXT: xxsel 34, 34, 0, 35 +; CHECK-NEXT: lvx 3, 0, 3 +; CHECK-NEXT: vaddubs 2, 2, 3 ; CHECK-NEXT: blr %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, @@ -451,12 +442,9 @@ ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: ; CHECK: # %bb.0: ; CHECK-NEXT: addis 3, 2, .LCPI28_0@toc@ha -; CHECK-NEXT: xxleqv 0, 0, 0 ; CHECK-NEXT: addi 3, 3, .LCPI28_0@toc@l ; CHECK-NEXT: lvx 3, 0, 3 -; CHECK-NEXT: vadduhm 3, 2, 3 -; CHECK-NEXT: vcmpgtuh 2, 2, 3 -; CHECK-NEXT: xxsel 34, 35, 0, 34 +; CHECK-NEXT: vadduhs 2, 2, 3 ; CHECK-NEXT: blr %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, %a @@ -467,16 +455,10 @@ define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_notval(<8 x i16> %x) { ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: ; CHECK: # %bb.0: -; CHECK-NEXT: addis 3, 2, .LCPI29_1@toc@ha -; CHECK-NEXT: xxleqv 0, 0, 0 -; CHECK-NEXT: addi 3, 3, .LCPI29_1@toc@l -; CHECK-NEXT: lvx 3, 0, 3 ; CHECK-NEXT: addis 3, 2, .LCPI29_0@toc@ha ; CHECK-NEXT: addi 3, 3, .LCPI29_0@toc@l -; CHECK-NEXT: vcmpgtuh 3, 2, 3 -; CHECK-NEXT: 
lvx 4, 0, 3 -; CHECK-NEXT: vadduhm 2, 2, 4 -; CHECK-NEXT: xxsel 34, 34, 0, 35 +; CHECK-NEXT: lvx 3, 0, 3 +; CHECK-NEXT: vadduhs 2, 2, 3 ; CHECK-NEXT: blr %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, @@ -506,12 +488,9 @@ ; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum: ; CHECK: # %bb.0: ; CHECK-NEXT: addis 3, 2, .LCPI31_0@toc@ha -; CHECK-NEXT: xxleqv 0, 0, 0 ; CHECK-NEXT: addi 3, 3, .LCPI31_0@toc@l ; CHECK-NEXT: lvx 3, 0, 3 -; CHECK-NEXT: vadduwm 3, 2, 3 -; CHECK-NEXT: vcmpgtuw 2, 2, 3 -; CHECK-NEXT: xxsel 34, 35, 0, 34 +; CHECK-NEXT: vadduws 2, 2, 3 ; CHECK-NEXT: blr %a = add <4 x i32> %x, %c = icmp ugt <4 x i32> %x, %a @@ -522,16 +501,10 @@ define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval(<4 x i32> %x) { ; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval: ; CHECK: # %bb.0: -; CHECK-NEXT: addis 3, 2, .LCPI32_1@toc@ha -; CHECK-NEXT: xxleqv 0, 0, 0 -; CHECK-NEXT: addi 3, 3, .LCPI32_1@toc@l -; CHECK-NEXT: lvx 3, 0, 3 ; CHECK-NEXT: addis 3, 2, .LCPI32_0@toc@ha ; CHECK-NEXT: addi 3, 3, .LCPI32_0@toc@l -; CHECK-NEXT: vcmpgtuw 3, 2, 3 -; CHECK-NEXT: lvx 4, 0, 3 -; CHECK-NEXT: vadduwm 2, 2, 4 -; CHECK-NEXT: xxsel 34, 34, 0, 35 +; CHECK-NEXT: lvx 3, 0, 3 +; CHECK-NEXT: vadduws 2, 2, 3 ; CHECK-NEXT: blr %a = add <4 x i32> %x, %c = icmp ugt <4 x i32> %x, @@ -616,10 +589,7 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: vaddubm 3, 2, 3 -; CHECK-NEXT: xxleqv 0, 0, 0 -; CHECK-NEXT: vcmpgtub 2, 2, 3 -; CHECK-NEXT: xxsel 34, 35, 0, 34 +; CHECK-NEXT: vaddubs 2, 2, 3 ; CHECK-NEXT: blr %a = add <16 x i8> %x, %y %c = icmp ugt <16 x i8> %x, %a @@ -660,10 +630,7 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: vadduhm 3, 2, 3 -; CHECK-NEXT: xxleqv 0, 0, 0 -; CHECK-NEXT: vcmpgtuh 2, 2, 3 -; CHECK-NEXT: xxsel 34, 35, 0, 34 +; CHECK-NEXT: vadduhs 2, 2, 3 ; CHECK-NEXT: blr %a = add <8 x i16> %x, %y %c = icmp ugt <8 x i16> %x, %a @@ -704,10 +671,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) { ; CHECK-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: vadduwm 3, 2, 3 -; CHECK-NEXT: xxleqv 0, 0, 0 -; CHECK-NEXT: vcmpgtuw 2, 2, 3 -; CHECK-NEXT: xxsel 34, 35, 0, 34 +; CHECK-NEXT: vadduws 2, 2, 3 ; CHECK-NEXT: blr %a = add <4 x i32> %x, %y %c = icmp ugt <4 x i32> %x, %a diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -385,19 +385,10 @@ ; SSE-NEXT: paddusb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX2-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddusb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum: +; AVX: # %bb.0: +; AVX-NEXT: vpaddusb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, %a %r = select <16 x i1> %c, <16 x i8> , <16 x i8> %a @@ -410,18 
+401,10 @@ ; SSE-NEXT: paddusb {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX2-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddusb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpmaxub {{.*}}(%rip), %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval: +; AVX: # %bb.0: +; AVX-NEXT: vpaddusb {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %a = add <16 x i8> %x, %c = icmp ugt <16 x i8> %x, %r = select <16 x i1> %c, <16 x i8> , <16 x i8> %a @@ -460,19 +443,10 @@ ; SSE-NEXT: paddusw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX2-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum: +; AVX: # %bb.0: +; AVX-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, %a %r = select <8 x i1> %c, <8 x i16> , <8 x i16> %a @@ -485,18 +459,10 @@ ; SSE-NEXT: paddusw {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; -; AVX2-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpmaxuw {{.*}}(%rip), %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval: +; AVX: # %bb.0: +; AVX-NEXT: vpaddusw {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %a = add <8 x i16> %x, %c = icmp ugt <8 x i16> %x, %r = select <8 x i1> %c, <8 x i16> , <8 x i16> %a @@ -555,35 +521,22 @@ ; ; SSE4-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [42,42,42,42] -; SSE4-NEXT: paddd %xmm0, %xmm2 -; SSE4-NEXT: movdqa %xmm0, %xmm1 -; SSE4-NEXT: pminud %xmm2, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: por %xmm2, %xmm1 -; SSE4-NEXT: movdqa %xmm1, %xmm0 +; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0 +; SSE4-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE4-NEXT: retq ; ; AVX2-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967253,4294967253,4294967253,4294967253] +; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum: ; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, 
%xmm1 -; AVX512-NEXT: vpcmpnleud %xmm1, %xmm0, %k1 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: retq %a = add <4 x i32> %x, %c = icmp ugt <4 x i32> %x, %a @@ -603,31 +556,22 @@ ; ; SSE4-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [42,42,42,42] -; SSE4-NEXT: paddd %xmm0, %xmm1 -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967254,4294967254,4294967254,4294967254] -; SSE4-NEXT: pmaxud %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE4-NEXT: por %xmm1, %xmm0 +; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0 +; SSE4-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE4-NEXT: retq ; ; AVX2-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [42,42,42,42] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967254,4294967254,4294967254,4294967254] -; AVX2-NEXT: vpmaxud %xmm2, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [4294967253,4294967253,4294967253,4294967253] +; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval: ; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnleud {{.*}}(%rip){1to4}, %xmm0, %k1 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: vpminud {{.*}}(%rip){1to4}, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0 ; AVX512-NEXT: retq %a = add <4 x i32> %x, %c = icmp ugt <4 x i32> %x, @@ -647,30 +591,15 @@ ; ; SSE4-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat: ; SSE4: # %bb.0: -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [43,44,45,46] -; SSE4-NEXT: paddd %xmm0, %xmm1 -; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4294967253,4294967252,4294967251,4294967250] -; SSE4-NEXT: pmaxud %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE4-NEXT: por %xmm1, %xmm0 +; SSE4-NEXT: pminud {{.*}}(%rip), %xmm0 +; SSE4-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE4-NEXT: retq ; -; AVX2-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpmaxud {{.*}}(%rip), %xmm0, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnleud {{.*}}(%rip), %xmm0, %k1 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat: +; AVX: # %bb.0: +; AVX-NEXT: vpminud {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %a = add <4 x i32> %x, %c = icmp ugt <4 x i32> %x, %r = select <4 x i1> %c, <4 x i32> , <4 x i32> %a @@ -788,32 +717,30 @@ ; ; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [42,42] -; SSE42-NEXT: paddq %xmm0, %xmm1 ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] -; SSE42-NEXT: pxor %xmm2, %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm2 -; SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; SSE42-NEXT: por %xmm1, %xmm0 +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm1 +; SSE42-NEXT: paddq {{.*}}(%rip), %xmm0 +; SSE42-NEXT: pxor %xmm0, %xmm2 +; SSE42-NEXT: pcmpgtq %xmm2, %xmm1 +; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum: ; AVX512: # %bb.0: -; AVX512-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: vpminuq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %a = add <2 x i64> %x, %c = icmp ugt <2 x i64> %x, %a @@ -858,28 +785,30 @@ ; ; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm1 = [42,42] -; SSE42-NEXT: paddq %xmm0, %xmm1 -; SSE42-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE42-NEXT: pcmpgtq {{.*}}(%rip), %xmm0 -; SSE42-NEXT: por %xmm1, %xmm0 +; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE42-NEXT: movdqa %xmm0, %xmm1 +; SSE42-NEXT: pxor %xmm2, %xmm1 +; SSE42-NEXT: paddq {{.*}}(%rip), %xmm0 +; SSE42-NEXT: pxor %xmm0, %xmm2 +; SSE42-NEXT: pcmpgtq %xmm2, %xmm1 +; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: movdqa %xmm1, %xmm0 ; SSE42-NEXT: retq ; ; AVX2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpcmpgtq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval: ; AVX512: # %bb.0: -; AVX512-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnleuq {{.*}}(%rip), %xmm0, %k1 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: vpminuq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512-NEXT: vpaddq {{.*}}(%rip), %xmm0, %xmm0 ; AVX512-NEXT: retq %a = add <2 x i64> %x, %c = icmp ugt <2 x i64> %x, @@ -924,19 +853,10 @@ ; SSE-NEXT: paddusb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX2-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm1 -; 
AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum: +; AVX: # %bb.0: +; AVX-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %a = add <16 x i8> %x, %y %c = icmp ugt <16 x i8> %x, %a %r = select <16 x i1> %c, <16 x i8> , <16 x i8> %a @@ -1030,19 +950,10 @@ ; SSE-NEXT: paddusw %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX2-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum: +; AVX: # %bb.0: +; AVX-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %a = add <8 x i16> %x, %y %c = icmp ugt <8 x i16> %x, %a %r = select <8 x i1> %c, <8 x i16> , <8 x i16> %a @@ -1159,33 +1070,26 @@ ; ; SSE4-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum: ; SSE4: # %bb.0: -; SSE4-NEXT: paddd %xmm0, %xmm1 -; SSE4-NEXT: movdqa %xmm0, %xmm2 -; SSE4-NEXT: pminud %xmm1, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE4-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE4-NEXT: pxor %xmm0, %xmm2 -; SSE4-NEXT: por %xmm1, %xmm2 -; SSE4-NEXT: movdqa %xmm2, %xmm0 +; SSE4-NEXT: pcmpeqd %xmm2, %xmm2 +; SSE4-NEXT: pxor %xmm1, %xmm2 +; SSE4-NEXT: pminud %xmm2, %xmm0 +; SSE4-NEXT: paddd %xmm1, %xmm0 ; SSE4-NEXT: retq ; ; AVX2-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 +; AVX2-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum: ; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnleud %xmm1, %xmm0, %k1 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %a = add <4 x i32> %x, %y %c = icmp ugt <4 x i32> %x, %a @@ -1364,8 +1268,8 @@ ; ; SSE42-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; SSE42: # %bb.0: -; SSE42-NEXT: paddq %xmm0, %xmm1 ; SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] +; SSE42-NEXT: paddq %xmm0, %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm0 ; SSE42-NEXT: pxor %xmm1, %xmm2 ; SSE42-NEXT: pcmpgtq %xmm2, %xmm0 @@ -1374,21 +1278,20 @@ ; ; AVX2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: 
vpxor %xmm2, %xmm0, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; AVX512: # %bb.0: -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm1 -; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 -; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} -; AVX512-NEXT: vmovdqa %xmm1, %xmm0 +; AVX512-NEXT: vmovdqa %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %a = add <2 x i64> %x, %y %c = icmp ugt <2 x i64> %x, %a diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -937,7 +937,7 @@ ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: v2i64: @@ -947,7 +947,7 @@ ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm1 ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v2i64: @@ -1012,12 +1012,12 @@ ; AVX1-NEXT: vpaddq %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm1 ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1028,7 +1028,7 @@ ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: v4i64: @@ -1118,12 +1118,12 @@ ; AVX1-NEXT: vpaddq %xmm7, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpor %xmm4, %xmm6, %xmm4 +; AVX1-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm6 ; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm4 @@ -1131,12 +1131,12 @@ ; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6 ; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vpor %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm4 ; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpor %xmm1, %xmm3, %xmm1 +; AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: retq ; @@ -1147,12 +1147,12 @@ ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm2 ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm2 
; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 ; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm3 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: v8i64: