diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9685,6 +9685,51 @@
         return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
       }
     }
+
+    // Match VSELECTs into add with unsigned saturation.
+    if (hasOperation(ISD::UADDSAT, VT)) {
+      // Check if one of the arms of the VSELECT is vector with all bits set.
+      // If it's on the left side invert the predicate to simplify logic below.
+      SDValue Other;
+      ISD::CondCode SatCC = CC;
+      if (ISD::isBuildVectorAllOnes(N1.getNode())) {
+        Other = N2;
+        SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType());
+      } else if (ISD::isBuildVectorAllOnes(N2.getNode())) {
+        Other = N1;
+      }
+
+      if (Other && Other.getOpcode() == ISD::ADD) {
+        SDValue CondLHS = LHS, CondRHS = RHS;
+        SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
+
+        // Canonicalize condition operands.
+        if (SatCC == ISD::SETUGE) {
+          std::swap(CondLHS, CondRHS);
+          SatCC = ISD::SETULE;
+        }
+
+        // We can test against either of the addition operands.
+        // x <= x+y ? x+y : ~0 --> addus x, y
+        // x+y >= x ? x+y : ~0 --> addus x, y
+        if (SatCC == ISD::SETULE && Other == CondRHS &&
+            (OpLHS == CondLHS || OpRHS == CondLHS))
+          return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
+
+        if (isa<ConstantSDNode>(OpRHS) && isa<ConstantSDNode>(CondRHS) &&
+            CondLHS == OpLHS) {
+          // If the RHS is a constant we have to reverse the const
+          // canonicalization.
+          // x >= ~C ? x+C : ~0 --> addus x, C
+          auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
+            return Cond->getAPIntValue() == ~Op->getAPIntValue();
+          };
+          if (SatCC == ISD::SETULE &&
+              ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
+            return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
+        }
+      }
+    }
   }
 
   if (SimplifySelectOps(N, N1, N2))
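The comments in the new DAGCombiner code above describe the compare-against-the-sum forms of an unsigned saturating add. As a rough illustration only (this function is not part of the patch; the name and the v8i16 type are arbitrary), the fold targets IR of the following shape whenever the target reports ISD::UADDSAT as a supported operation for the vector type:

; Unsigned saturating add written as compare + select on the sum:
; a lane overflowed iff x >u x+y, and overflowed lanes become all-ones.
define <8 x i16> @uaddsat_cmp_sum_sketch(<8 x i16> %x, <8 x i16> %y) {
  %sum = add <8 x i16> %x, %y
  %ov = icmp ugt <8 x i16> %x, %sum
  %res = select <8 x i1> %ov, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %sum
  ret <8 x i16> %res
}

On such input the combine should emit a single ISD::UADDSAT node in place of the add/setcc/vselect chain.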
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41036,59 +41036,6 @@
     }
   }
 
-  // Match VSELECTs into add with unsigned saturation.
-  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
-      // paddus is available in SSE2 for i8 and i16 vectors.
-      Subtarget.hasSSE2() && VT.getVectorNumElements() >= 2 &&
-      isPowerOf2_32(VT.getVectorNumElements()) &&
-      (VT.getVectorElementType() == MVT::i8 ||
-       VT.getVectorElementType() == MVT::i16)) {
-    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
-
-    SDValue CondLHS = Cond->getOperand(0);
-    SDValue CondRHS = Cond->getOperand(1);
-
-    // Check if one of the arms of the VSELECT is vector with all bits set.
-    // If it's on the left side invert the predicate to simplify logic below.
-    SDValue Other;
-    if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
-      Other = RHS;
-      CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
-    } else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
-      Other = LHS;
-    }
-
-    if (Other.getNode() && Other.getOpcode() == ISD::ADD) {
-      SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
-
-      // Canonicalize condition operands.
-      if (CC == ISD::SETUGE) {
-        std::swap(CondLHS, CondRHS);
-        CC = ISD::SETULE;
-      }
-
-      // We can test against either of the addition operands.
-      // x <= x+y ? x+y : ~0 --> addus x, y
-      // x+y >= x ? x+y : ~0 --> addus x, y
-      if (CC == ISD::SETULE && Other == CondRHS &&
-          (OpLHS == CondLHS || OpRHS == CondLHS))
-        return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
-
-      if (isa<ConstantSDNode>(OpRHS) && isa<ConstantSDNode>(CondRHS) &&
-          CondLHS == OpLHS) {
-        // If the RHS is a constant we have to reverse the const
-        // canonicalization.
-        // x > ~C ? x+C : ~0 --> addus x, C
-        auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
-          return Cond->getAPIntValue() == ~Op->getAPIntValue();
-        };
-        if (CC == ISD::SETULE &&
-            ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
-          return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
-      }
-    }
-  }
-
   // Check if the first operand is all zeros and Cond type is vXi1.
   // If this an avx512 target we can improve the use of zero masking by
   // swapping the operands and inverting the condition.
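The X86-specific combine removed above handled the same patterns but only for i8/i16 vectors on SSE2 targets; the generic version applies to any type for which ISD::UADDSAT is available. For the constant form, the overflow check is usually canonicalized to compare against ~C instead of against the sum, which the MatchUADDSAT lambda undoes. A hand-written sketch of that shape, again not taken from the patch (function name made up, C = 42 chosen only to mirror the tests):

; x + 42 saturates iff x >u ~42, i.e. iff x >u 4294967253.
define <4 x i32> @uaddsat_cmp_notval_sketch(<4 x i32> %x) {
  %sum = add <4 x i32> %x, <i32 42, i32 42, i32 42, i32 42>
  %ov = icmp ugt <4 x i32> %x, <i32 -43, i32 -43, i32 -43, i32 -43>
  %res = select <4 x i1> %ov, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %sum
  ret <4 x i32> %res
}

This is the shape of the unsigned_sat_constant_*_using_cmp_notval functions exercised by the tests below.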
diff --git a/llvm/test/CodeGen/AArch64/sat-add.ll b/llvm/test/CodeGen/AArch64/sat-add.ll
--- a/llvm/test/CodeGen/AArch64/sat-add.ll
+++ b/llvm/test/CodeGen/AArch64/sat-add.ll
@@ -360,9 +360,7 @@
 ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.16b, #42
-; CHECK-NEXT:    add v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhi v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %a = add <16 x i8> %x,
   %c = icmp ugt <16 x i8> %x, %a
@@ -374,10 +372,7 @@
 ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.16b, #42
-; CHECK-NEXT:    movi v2.16b, #213
-; CHECK-NEXT:    add v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhi v0.16b, v0.16b, v2.16b
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %a = add <16 x i8> %x,
   %c = icmp ugt <16 x i8> %x,
@@ -403,9 +398,7 @@
 ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.8h, #42
-; CHECK-NEXT:    add v1.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmhi v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %a = add <8 x i16> %x,
   %c = icmp ugt <8 x i16> %x, %a
@@ -417,10 +410,7 @@
 ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.8h, #42
-; CHECK-NEXT:    mvni v2.8h, #42
-; CHECK-NEXT:    add v1.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmhi v0.8h, v0.8h, v2.8h
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %a = add <8 x i16> %x,
   %c = icmp ugt <8 x i16> %x,
@@ -446,9 +436,7 @@
 ; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #42
-; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %a = add <4 x i32> %x,
   %c = icmp ugt <4 x i32> %x, %a
@@ -460,10 +448,7 @@
 ; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v1.4s, #42
-; CHECK-NEXT:    mvni v2.4s, #42
-; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v2.4s
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %a = add <4 x i32> %x,
   %c = icmp ugt <4 x i32> %x,
@@ -493,9 +478,7 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #42
 ; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    add v1.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmhi v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %a = add <2 x i64> %x,
   %c = icmp ugt <2 x i64> %x, %a
@@ -507,12 +490,8 @@
 ; CHECK-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mov w8, #42
-; CHECK-NEXT:    mov x9, #-43
 ; CHECK-NEXT:    dup v1.2d, x8
-; CHECK-NEXT:    dup v2.2d, x9
-; CHECK-NEXT:    add v1.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmhi v0.2d, v0.2d, v2.2d
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %a = add <2 x i64> %x,
   %c = icmp ugt <2 x i64> %x,
@@ -537,9 +516,7 @@
 define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.16b, v0.16b, v1.16b
-; CHECK-NEXT:    cmhi v0.16b, v0.16b, v1.16b
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %a = add <16 x i8> %x, %y
   %c = icmp ugt <16 x i8> %x, %a
@@ -579,9 +556,7 @@
 define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.8h, v0.8h, v1.8h
-; CHECK-NEXT:    cmhi v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.8h, v0.8h, v1.8h
 ; CHECK-NEXT:    ret
   %a = add <8 x i16> %x, %y
   %c = icmp ugt <8 x i16> %x, %a
@@ -621,9 +596,7 @@
 define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    cmhi v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %a = add <4 x i32> %x, %y
   %c = icmp ugt <4 x i32> %x, %a
@@ -664,9 +637,7 @@
 define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i64> %y) {
 ; CHECK-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    add v1.2d, v0.2d, v1.2d
-; CHECK-NEXT:    cmhi v0.2d, v0.2d, v1.2d
-; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    uqadd v0.2d, v0.2d, v1.2d
 ; CHECK-NEXT:    ret
   %a = add <2 x i64> %x, %y
   %c = icmp ugt <2 x i64> %x, %a
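For reference, the same operation written with the generic saturating-add intrinsic should already lower to these instructions (uqadd above, vaddubs/vadduhs/vadduws in the PowerPC tests that follow), since the new combine only fires when the target supports ISD::UADDSAT for the type; the test updates show the open-coded add/icmp/select form now getting the same codegen. Sketch only, not part of the test files:

declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>)

define <16 x i8> @uaddsat_intrinsic_sketch(<16 x i8> %x, <16 x i8> %y) {
  %res = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
  ret <16 x i8> %res
}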
diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll
--- a/llvm/test/CodeGen/PowerPC/sat-add.ll
+++ b/llvm/test/CodeGen/PowerPC/sat-add.ll
@@ -396,12 +396,9 @@
 ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_sum:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addis 3, 2, .LCPI25_0@toc@ha
-; CHECK-NEXT:    xxleqv 0, 0, 0
 ; CHECK-NEXT:    addi 3, 3, .LCPI25_0@toc@l
 ; CHECK-NEXT:    lvx 3, 0, 3
-; CHECK-NEXT:    vaddubm 3, 2, 3
-; CHECK-NEXT:    vcmpgtub 2, 2, 3
-; CHECK-NEXT:    xxsel 34, 35, 0, 34
+; CHECK-NEXT:    vaddubs 2, 2, 3
 ; CHECK-NEXT:    blr
   %a = add <16 x i8> %x,
   %c = icmp ugt <16 x i8> %x, %a
@@ -412,16 +409,10 @@
 define <16 x i8> @unsigned_sat_constant_v16i8_using_cmp_notval(<16 x i8> %x) {
 ; CHECK-LABEL: unsigned_sat_constant_v16i8_using_cmp_notval:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addis 3, 2, .LCPI26_1@toc@ha
-; CHECK-NEXT:    xxleqv 0, 0, 0
-; CHECK-NEXT:    addi 3, 3, .LCPI26_1@toc@l
-; CHECK-NEXT:    lvx 3, 0, 3
 ; CHECK-NEXT:    addis 3, 2, .LCPI26_0@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .LCPI26_0@toc@l
-; CHECK-NEXT:    vcmpgtub 3, 2, 3
-; CHECK-NEXT:    lvx 4, 0, 3
-; CHECK-NEXT:    vaddubm 2, 2, 4
-; CHECK-NEXT:    xxsel 34, 34, 0, 35
+; CHECK-NEXT:    lvx 3, 0, 3
+; CHECK-NEXT:    vaddubs 2, 2, 3
 ; CHECK-NEXT:    blr
   %a = add <16 x i8> %x,
   %c = icmp ugt <16 x i8> %x,
@@ -451,12 +442,9 @@
 ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_sum:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addis 3, 2, .LCPI28_0@toc@ha
-; CHECK-NEXT:    xxleqv 0, 0, 0
 ; CHECK-NEXT:    addi 3, 3, .LCPI28_0@toc@l
 ; CHECK-NEXT:    lvx 3, 0, 3
-; CHECK-NEXT:    vadduhm 3, 2, 3
-; CHECK-NEXT:    vcmpgtuh 2, 2, 3
-; CHECK-NEXT:    xxsel 34, 35, 0, 34
+; CHECK-NEXT:    vadduhs 2, 2, 3
 ; CHECK-NEXT:    blr
   %a = add <8 x i16> %x,
   %c = icmp ugt <8 x i16> %x, %a
@@ -467,16 +455,10 @@
 define <8 x i16> @unsigned_sat_constant_v8i16_using_cmp_notval(<8 x i16> %x) {
 ; CHECK-LABEL: unsigned_sat_constant_v8i16_using_cmp_notval:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addis 3, 2, .LCPI29_1@toc@ha
-; CHECK-NEXT:    xxleqv 0, 0, 0
-; CHECK-NEXT:    addi 3, 3, .LCPI29_1@toc@l
-; CHECK-NEXT:    lvx 3, 0, 3
 ; CHECK-NEXT:    addis 3, 2, .LCPI29_0@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .LCPI29_0@toc@l
-; CHECK-NEXT:    vcmpgtuh 3, 2, 3
-; CHECK-NEXT:    lvx 4, 0, 3
-; CHECK-NEXT:    vadduhm 2, 2, 4
-; CHECK-NEXT:    xxsel 34, 34, 0, 35
+; CHECK-NEXT:    lvx 3, 0, 3
+; CHECK-NEXT:    vadduhs 2, 2, 3
 ; CHECK-NEXT:    blr
   %a = add <8 x i16> %x,
   %c = icmp ugt <8 x i16> %x,
@@ -506,12 +488,9 @@
 ; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addis 3, 2, .LCPI31_0@toc@ha
-; CHECK-NEXT:    xxleqv 0, 0, 0
 ; CHECK-NEXT:    addi 3, 3, .LCPI31_0@toc@l
 ; CHECK-NEXT:    lvx 3, 0, 3
-; CHECK-NEXT:    vadduwm 3, 2, 3
-; CHECK-NEXT:    vcmpgtuw 2, 2, 3
-; CHECK-NEXT:    xxsel 34, 35, 0, 34
+; CHECK-NEXT:    vadduws 2, 2, 3
 ; CHECK-NEXT:    blr
   %a = add <4 x i32> %x,
   %c = icmp ugt <4 x i32> %x, %a
@@ -522,16 +501,10 @@
 define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval(<4 x i32> %x) {
 ; CHECK-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addis 3, 2, .LCPI32_1@toc@ha
-; CHECK-NEXT:    xxleqv 0, 0, 0
-; CHECK-NEXT:    addi 3, 3, .LCPI32_1@toc@l
-; CHECK-NEXT:    lvx 3, 0, 3
 ; CHECK-NEXT:    addis 3, 2, .LCPI32_0@toc@ha
 ; CHECK-NEXT:    addi 3, 3, .LCPI32_0@toc@l
-; CHECK-NEXT:    vcmpgtuw 3, 2, 3
-; CHECK-NEXT:    lvx 4, 0, 3
-; CHECK-NEXT:    vadduwm 2, 2, 4
-; CHECK-NEXT:    xxsel 34, 34, 0, 35
+; CHECK-NEXT:    lvx 3, 0, 3
+; CHECK-NEXT:    vadduws 2, 2, 3
 ; CHECK-NEXT:    blr
   %a = add <4 x i32> %x,
   %c = icmp ugt <4 x i32> %x,
@@ -616,10 +589,7 @@
 define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_sum(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: unsigned_sat_variable_v16i8_using_cmp_sum:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vaddubm 3, 2, 3
-; CHECK-NEXT:    xxleqv 0, 0, 0
-; CHECK-NEXT:    vcmpgtub 2, 2, 3
-; CHECK-NEXT:    xxsel 34, 35, 0, 34
+; CHECK-NEXT:    vaddubs 2, 2, 3
 ; CHECK-NEXT:    blr
   %a = add <16 x i8> %x, %y
   %c = icmp ugt <16 x i8> %x, %a
@@ -660,10 +630,7 @@
 define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_sum(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: unsigned_sat_variable_v8i16_using_cmp_sum:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vadduhm 3, 2, 3
-; CHECK-NEXT:    xxleqv 0, 0, 0
-; CHECK-NEXT:    vcmpgtuh 2, 2, 3
-; CHECK-NEXT:    xxsel 34, 35, 0, 34
+; CHECK-NEXT:    vadduhs 2, 2, 3
 ; CHECK-NEXT:    blr
   %a = add <8 x i16> %x, %y
   %c = icmp ugt <8 x i16> %x, %a
@@ -704,10 +671,7 @@
 define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vadduwm 3, 2, 3
-; CHECK-NEXT:    xxleqv 0, 0, 0
-; CHECK-NEXT:    vcmpgtuw 2, 2, 3
-; CHECK-NEXT:    xxsel 34, 35, 0, 34
+; CHECK-NEXT:    vadduws 2, 2, 3
 ; CHECK-NEXT:    blr
   %a = add <4 x i32> %x, %y
   %c = icmp ugt <4 x i32> %x, %a
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -462,26 +462,20 @@
 define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_sum(<4 x i32> %x) {
 ; SSE2-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [42,42,42,42]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: unsigned_sat_constant_v4i32_using_cmp_sum:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [42,42,42,42]
-; SSE4-NEXT:    paddd %xmm0, %xmm2
-; SSE4-NEXT:    movdqa %xmm0, %xmm1
-; SSE4-NEXT:    pminud %xmm2, %xmm1
-; SSE4-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT:    pxor %xmm0, %xmm1
-; SSE4-NEXT:    por %xmm2, %xmm1
-; SSE4-NEXT:    movdqa %xmm1, %xmm0
+; SSE4-NEXT:    pminud {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; SSE4-NEXT:    retq
   %a = add <4 x i32> %x,
   %c = icmp ugt <4 x i32> %x, %a
@@ -492,21 +486,20 @@
 define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval(<4 x i32> %x) {
 ; SSE2-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [42,42,42,42]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [42,42,42,42]
-; SSE4-NEXT:    paddd %xmm0, %xmm1
-; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [4294967254,4294967254,4294967254,4294967254]
-; SSE4-NEXT:    pmaxud %xmm0, %xmm2
-; SSE4-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE4-NEXT:    por %xmm1, %xmm0
+; SSE4-NEXT:    pminud {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; SSE4-NEXT:    retq
   %a = add <4 x i32> %x,
   %c = icmp ugt <4 x i32> %x,
@@ -517,21 +510,20 @@
 define <4 x i32> @unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat(<4 x i32> %x) {
 ; SSE2-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [43,44,45,46]
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    pcmpgtd {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    por %xmm1, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    paddd {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    pcmpgtd %xmm2, %xmm1
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE4-LABEL: unsigned_sat_constant_v4i32_using_cmp_notval_nonsplat:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    movdqa {{.*#+}} xmm1 = [43,44,45,46]
-; SSE4-NEXT:    paddd %xmm0, %xmm1
-; SSE4-NEXT:    movdqa {{.*#+}} xmm2 = [4294967253,4294967252,4294967251,4294967250]
-; SSE4-NEXT:    pmaxud %xmm0, %xmm2
-; SSE4-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE4-NEXT:    por %xmm1, %xmm0
+; SSE4-NEXT:    pminud {{.*}}(%rip), %xmm0
+; SSE4-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; SSE4-NEXT:    retq
   %a = add <4 x i32> %x,
   %c = icmp ugt <4 x i32> %x,
@@ -598,49 +590,52 @@
 define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_sum(<2 x i64> %x) {
 ; SSE2-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [42,42]
-; SSE2-NEXT:    paddq %xmm0, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    paddq {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [42,42]
-; SSE41-NEXT:    paddq %xmm0, %xmm1
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT:    pxor %xmm2, %xmm0
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    pxor %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE41-NEXT:    paddq {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm4, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_sum:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [42,42]
-; SSE42-NEXT:    paddq %xmm0, %xmm1
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT:    pxor %xmm2, %xmm0
-; SSE42-NEXT:    pxor %xmm1, %xmm2
-; SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT:    por %xmm1, %xmm0
+; SSE42-NEXT:    movdqa %xmm0, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm1
+; SSE42-NEXT:    paddq {{.*}}(%rip), %xmm0
+; SSE42-NEXT:    pxor %xmm0, %xmm2
+; SSE42-NEXT:    pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT:    por %xmm0, %xmm1
+; SSE42-NEXT:    movdqa %xmm1, %xmm0
 ; SSE42-NEXT:    retq
   %a = add <2 x i64> %x,
   %c = icmp ugt <2 x i64> %x, %a
@@ -651,45 +646,52 @@
 define <2 x i64> @unsigned_sat_constant_v2i64_using_cmp_notval(<2 x i64> %x) {
 ; SSE2-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [42,42]
-; SSE2-NEXT:    paddq %xmm0, %xmm1
-; SSE2-NEXT:    pxor {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117]
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm1, %xmm2
+; SSE2-NEXT:    paddq {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT:    por %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE2-NEXT:    por %xmm0, %xmm1
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [42,42]
-; SSE41-NEXT:    paddq %xmm0, %xmm1
-; SSE41-NEXT:    pxor {{.*}}(%rip), %xmm0
-; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372034707292117,9223372034707292117]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pcmpgtd %xmm2, %xmm3
+; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    movdqa %xmm0, %xmm2
+; SSE41-NEXT:    pxor %xmm1, %xmm2
+; SSE41-NEXT:    paddq {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm0, %xmm1
+; SSE41-NEXT:    movdqa %xmm2, %xmm3
+; SSE41-NEXT:    pcmpgtd %xmm1, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm0
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pcmpeqd %xmm2, %xmm1
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
 ; SSE41-NEXT:    pand %xmm4, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE41-NEXT:    por %xmm1, %xmm0
-; SSE41-NEXT:    por %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE42-LABEL: unsigned_sat_constant_v2i64_using_cmp_notval:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [42,42]
-; SSE42-NEXT:    paddq %xmm0, %xmm1
-; SSE42-NEXT:    pxor {{.*}}(%rip), %xmm0
-; SSE42-NEXT:    pcmpgtq {{.*}}(%rip), %xmm0
-; SSE42-NEXT:    por %xmm1, %xmm0
+; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT:    movdqa %xmm0, %xmm1
+; SSE42-NEXT:    pxor %xmm2, %xmm1
+; SSE42-NEXT:    paddq {{.*}}(%rip), %xmm0
+; SSE42-NEXT:    pxor %xmm0, %xmm2
+; SSE42-NEXT:    pcmpgtq %xmm2, %xmm1
+; SSE42-NEXT:    por %xmm0, %xmm1
+; SSE42-NEXT:    movdqa %xmm1, %xmm0
 ; SSE42-NEXT:    retq
   %a = add <2 x i64> %x,
   %c = icmp ugt <2 x i64> %x,
@@ -841,8 +843,8 @@
 define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i32> %y) {
 ; SSE2-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
+; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
@@ -851,14 +853,10 @@
 ;
 ; SSE4-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum:
 ; SSE4:       # %bb.0:
-; SSE4-NEXT:    paddd %xmm0, %xmm1
-; SSE4-NEXT:    movdqa %xmm0, %xmm2
-; SSE4-NEXT:    pminud %xmm1, %xmm2
-; SSE4-NEXT:    pcmpeqd %xmm0, %xmm2
-; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
-; SSE4-NEXT:    pxor %xmm0, %xmm2
-; SSE4-NEXT:    por %xmm1, %xmm2
-; SSE4-NEXT:    movdqa %xmm2, %xmm0
+; SSE4-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE4-NEXT:    pxor %xmm1, %xmm2
+; SSE4-NEXT:    pminud %xmm2, %xmm0
+; SSE4-NEXT:    paddd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
   %a = add <4 x i32> %x, %y
   %c = icmp ugt <4 x i32> %x, %a
@@ -963,8 +961,8 @@
 define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i64> %y) {
 ; SSE2-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    paddq %xmm0, %xmm1
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE2-NEXT:    paddq %xmm0, %xmm1
 ; SSE2-NEXT:    pxor %xmm2, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm2
 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
@@ -980,8 +978,8 @@
 ;
 ; SSE41-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    paddq %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
+; SSE41-NEXT:    paddq %xmm0, %xmm1
 ; SSE41-NEXT:    pxor %xmm2, %xmm0
 ; SSE41-NEXT:    pxor %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
@@ -997,8 +995,8 @@
 ;
 ; SSE42-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum:
 ; SSE42:       # %bb.0:
-; SSE42-NEXT:    paddq %xmm0, %xmm1
 ; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; SSE42-NEXT:    paddq %xmm0, %xmm1
 ; SSE42-NEXT:    pxor %xmm2, %xmm0
 ; SSE42-NEXT:    pxor %xmm1, %xmm2
 ; SSE42-NEXT:    pcmpgtq %xmm2, %xmm0