Index: include/llvm/CodeGen/TargetLowering.h
===================================================================
--- include/llvm/CodeGen/TargetLowering.h
+++ include/llvm/CodeGen/TargetLowering.h
@@ -3695,6 +3695,8 @@
                                                SDValue N1, ISD::CondCode Cond,
                                                DAGCombinerInfo &DCI,
                                                const SDLoc &DL) const;
+  SDValue BuildREMEqFold(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
+                         DAGCombinerInfo &DCI, const SDLoc &DL) const;
 };
 
 /// Given an LLVM IR type and return type attributes, compute the return value
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2774,6 +2774,14 @@
       return V;
   }
 
+  // Fold remainder of division by a constant.
+  if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && N0.hasOneUse() &&
+      (N0.getOpcode() == ISD::UREM || N0.getOpcode() == ISD::SREM)) {
+    SDValue Folded = BuildREMEqFold(VT, N0, N1, Cond, DCI, dl);
+    if (Folded)
+      return Folded;
+  }
+
   // Fold away ALL boolean setcc's.
   SDValue Temp;
   if (N0.getValueType().getScalarType() == MVT::i1 && foldBooleans) {
@@ -3703,6 +3711,119 @@
   return DAG.getSelect(dl, VT, IsOne, N0, Q);
 }
 
+/// Given an ISD::UREM or ISD::SREM used only by an ISD::SETEQ or ISD::SETNE
+/// where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
+SDValue TargetLowering::BuildREMEqFold(EVT VT, SDValue REMNode,
+                                       SDValue CompNode, ISD::CondCode Cond,
+                                       DAGCombinerInfo &DCI,
+                                       const SDLoc &DL) const {
+  // fold (seteq/ne (urem N, D), 0) -> (setule/setugt (rotr (mul N, P), K), Q)
+  // fold (seteq/ne (srem N, D), 0)
+  //   -> (setule/setugt (rotr (add (mul N, P), Q''), K), (srl Q', K-1))
+  // - D must be constant with D = D0 * 2^K where D0 is odd
+  // - P is the multiplicative inverse of D0 modulo 2^W
+  // - Q = floor((2^W - 1) / D0)
+  // - Q' = floor((2^(W - 1) - 1) / D0)
+  // - Q'' = Q' & -2^K
+  // where W is the width of the common type of N and D
+  SelectionDAG &DAG = DCI.DAG;
+  EVT REMVT = REMNode->getValueType(0);
+  bool IsEq = (Cond == ISD::SETEQ);
+  bool IsSigned = (REMNode->getOpcode() == ISD::SREM);
+
+  if (!isTypeLegal(REMVT))
+    return SDValue();
+
+  // Keep divrem when optimizing for minimum size.
+  if (DAG.getMachineFunction().getFunction().optForMinSize())
+    return SDValue();
+
+  // TODO: Add non-uniform constant support.
+  ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1));
+  ConstantSDNode *CompTarget = isConstOrConstSplat(CompNode);
+  if (!Divisor || !CompTarget || Divisor->isNullValue() ||
+      !CompTarget->isNullValue())
+    return SDValue();
+
+  APInt D = Divisor->getAPIntValue();
+  unsigned W = D.getBitWidth();
+
+  // The algorithm implemented below assumes D0 > 1.
+  // We use the fact that N % D == 0 <=> N % |D| == 0 to guarantee D > 0 here.
+  if (IsSigned && D.isNegative())
+    D.negate();
+
+  // Rewrite D = D0 * 2^K.
+  unsigned K = D.countTrailingZeros();
+  APInt D0 = D.lshr(K);
+
+  // If D0 == 1, we cannot build this fold. Otherwise, D0 > 1, as needed.
+  if (D0.isOneValue())
+    return SDValue();
+
+  // Calculate the multiplicative inverse P of D0 using Newton's method.
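+  // The iteration P <- P * (2 - D0 * P) doubles the number of correct low
+  // bits of the inverse per step (Hensel lifting); starting from P = D0 is
+  // valid because D0 * D0 == 1 (mod 8) for any odd D0, i.e. the low three
+  // bits are already correct. E.g. for D0 = 7 and W = 16 this converges to
+  // P = 28087 (0x6DB7), and indeed 7 * 28087 == 3 * 2^16 + 1 == 1 (mod 2^16).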
+  APInt tmp;
+  APInt P = D0;
+  while ((tmp = D0 * P) != 1)
+    P *= APInt(D0.getBitWidth(), 2) - tmp;
+
+  // Q = floor((2^W - 1) / D0)
+  APInt Q = APInt::getAllOnesValue(W);
+  if (IsSigned) {
+    // Q' = floor((2^(W - 1) - 1) / D0)
+    Q.clearSignBit();
+  }
+  Q = Q.udiv(D0);
+  if (IsSigned) {
+    // Q'' = Q' & -2^K
+    APInt Neg2K = APInt(W, 0);
+    Neg2K.setHighBits(W - K);
+    Q &= Neg2K;
+  }
+
+  SDValue PVal = DAG.getConstant(P, DL, REMVT);
+  SDValue QVal = DAG.getConstant(Q, DL, REMVT);
+  // (mul N, P)
+  SDValue Op1 = DAG.getNode(ISD::MUL, DL, REMVT, REMNode->getOperand(0), PVal);
+  DCI.AddToWorklist(Op1.getNode());
+  // Will change in the signed case.
+  SDValue Op2 = QVal;
+
+  if (IsSigned) {
+    // (add (mul N, P), Q')
+    Op1 = DAG.getNode(ISD::ADD, DL, REMVT, Op1, QVal);
+    DCI.AddToWorklist(Op1.getNode());
+  }
+
+  // Rotate right only if D was even and thus shifted (K > 0).
+  if (K) {
+    SDValue ShAmt =
+        DAG.getConstant(K, DL, getShiftAmountTy(REMVT, DAG.getDataLayout()));
+    SDNodeFlags Flags;
+    Flags.setExact(true);
+    // UREM: (rotr (mul N, P), K)
+    // SREM: (rotr (add (mul N, P), Q'), K)
+    Op1 = DAG.getNode(ISD::ROTR, DL, REMVT, Op1, ShAmt, Flags);
+    DCI.AddToWorklist(Op1.getNode());
+  }
+
+  // Don't insert a useless node if K - 1 == 0.
+  if (IsSigned && K != 1) {
+    if (K != 0)
+      Op2 = DAG.getConstant(Q.lshr(K - 1), DL, REMVT);
+    else
+      Op2 = DAG.getConstant(Q.shl(1), DL, REMVT);
+  }
+
+  // UREM: (setule/setugt (rotr (mul N, P), K), Q)
+  // SREM: (setule/setugt (rotr (add (mul N, P), Q''), K), (srl Q', K-1))
+  Op1 = DAG.getSetCC(DL, VT, Op1, Op2, (IsEq ? ISD::SETULE : ISD::SETUGT));
+  return Op1;
+}
+
 bool TargetLowering::
 verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
   if (!isa<ConstantSDNode>(Op.getOperand(0))) {
Index: test/CodeGen/X86/jump_sign.ll
===================================================================
--- test/CodeGen/X86/jump_sign.ll
+++ test/CodeGen/X86/jump_sign.ll
@@ -236,13 +236,11 @@
 ; CHECK-NEXT:    jne .LBB12_8
 ; CHECK-NEXT:  # %bb.4: # %if.end29
 ; CHECK-NEXT:    movzwl (%eax), %eax
-; CHECK-NEXT:    movzwl %ax, %eax
-; CHECK-NEXT:    imull $52429, %eax, %ecx # imm = 0xCCCD
-; CHECK-NEXT:    shrl $19, %ecx
-; CHECK-NEXT:    addl %ecx, %ecx
-; CHECK-NEXT:    leal (%ecx,%ecx,4), %ecx
-; CHECK-NEXT:    cmpw %cx, %ax
-; CHECK-NEXT:    jne .LBB12_5
+; CHECK-NEXT:    imull $-13107, %eax, %eax
+; CHECK-NEXT:    rorw $1, %ax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    cmpl $13108, %eax
+; CHECK-NEXT:    jae .LBB12_5
 ; CHECK-NEXT:  .LBB12_8: # %if.then44
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
Index: test/CodeGen/X86/rem-seteq-optsize.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/rem-seteq-optsize.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
+
+; On X86, division is expensive. BuildREMEqFold should therefore run even
+; when optimizing for size. Only optimizing for minimum size retains a plain div.
+
+define i32 @test_minsize(i32 %X) optsize minsize nounwind readnone {
+; CHECK: divl
+; CHECK-NOT: imull
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
+
+define i32 @test_optsize(i32 %X) optsize nounwind readnone {
+; CHECK: imull
+; CHECK-NOT: divl
+  %rem = urem i32 %X, 5
+  %cmp = icmp eq i32 %rem, 0
+  %ret = select i1 %cmp, i32 42, i32 -10
+  ret i32 %ret
+}
Index: test/CodeGen/X86/rem-seteq.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/rem-seteq.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
+
+; This tests the BuildREMEqFold optimization with UREM, i32, odd divisor, SETEQ.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(5, 2^32)] <=> [N * 0xCCCCCCCD] <=> [N * (-858993459)]
+; res <- [Q <= (2^32 - 1) / 5] <=> [Q <= 858993459] <=> [Q < 858993460]
+define i32 @test_urem_odd(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_odd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-858993459, {{[0-9]+}}(%esp), %ecx # imm = 0xCCCCCCCD
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $858993460, %ecx # imm = 0x33333334
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = urem i32 %X, 5
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 30 set.
+define i32 @test_urem_odd_bit30(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_odd_bit30:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $1789569707, {{[0-9]+}}(%esp), %ecx # imm = 0x6AAAAAAB
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $4, %ecx
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = urem i32 %X, 1073741827
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_odd, except the divisor has bit 31 set.
+define i32 @test_urem_odd_bit31(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_odd_bit31:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $715827883, {{[0-9]+}}(%esp), %ecx # imm = 0x2AAAAAAB
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $2, %ecx
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = urem i32 %X, 2147483651
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This tests the BuildREMEqFold optimization with UREM, i16, even divisor, SETNE.
+; In this case, D <=> 14 <=> 7 * 2^1, so D0 = 7 and K = 1.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(D0, 2^16)] <=> [N * multInv(7, 2^16)] <=> [N * 28087]
+; Q <- [Q >>rot K] <=> [Q >>rot 1]
+; res <- ![Q <= (2^16 - 1) / 7] <=> ![Q <= 9362] <=> [Q > 9362]
+define i16 @test_urem_even(i16 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_even:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $28087, {{[0-9]+}}(%esp), %eax # imm = 0x6DB7
+; CHECK-NEXT:    rorw $1, %ax
+; CHECK-NEXT:    movzwl %ax, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $9362, %ecx # imm = 0x2492
+; CHECK-NEXT:    seta %al
+; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = urem i16 %X, 14
+  %cmp = icmp ne i16 %0, 0
+  %ret = zext i1 %cmp to i16
+  ret i16 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 30 set.
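+; In this case, D <=> 1073741928 <=> 134217741 * 2^3, so D0 = 134217741 and K = 3.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(D0, 2^32)] <=> [N * 0xFCEC4EC5] <=> [N * (-51622203)]
+; Q <- [Q >>rot K] <=> [Q >>rot 3]
+; res <- [Q <= (2^32 - 1) / D0] <=> [Q <= 31] <=> [Q < 32]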
+define i32 @test_urem_even_bit30(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_even_bit30:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5
+; CHECK-NEXT:    rorl $3, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $32, %ecx
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = urem i32 %X, 1073741928
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_urem_even, except the divisor has bit 31 set.
+define i32 @test_urem_even_bit31(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_urem_even_bit31:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-1157956869, {{[0-9]+}}(%esp), %ecx # imm = 0xBAFAFAFB
+; CHECK-NEXT:    rorl $1, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $4, %ecx
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = urem i32 %X, 2147483750
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This tests the BuildREMEqFold optimization with SREM, i32, odd divisor, SETEQ.
+; The corresponding pseudocode is:
+; Q <- [N * multInv(31, 2^32)] <=> [N * 0xBDEF7BDF] <=> [N * (-1108378657)]
+; a <- [((2^31 - 1) / 31)] <=> 69273666
+; Q' <- Q + a
+; res <- [Q' <= 2a/2^0] <=> [Q' <= 138547332] <=> [Q' < 138547333]
+define i32 @test_srem_odd(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_srem_odd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-1108378657, {{[0-9]+}}(%esp), %ecx # imm = 0xBDEF7BDF
+; CHECK-NEXT:    addl $69273666, %ecx # imm = 0x4210842
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $138547333, %ecx # imm = 0x8421085
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = srem i32 %X, 31
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is the same as test_srem_odd, but the divisor is negative.
+; The output should be the same.
+define i32 @test_srem_odd_neg(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_srem_odd_neg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-1108378657, {{[0-9]+}}(%esp), %ecx # imm = 0xBDEF7BDF
+; CHECK-NEXT:    addl $69273666, %ecx # imm = 0x4210842
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $138547333, %ecx # imm = 0x8421085
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = srem i32 %X, -31
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is like test_srem_odd, except the divisor has bit 30 set.
+define i32 @test_srem_odd_bit30(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_srem_odd_bit30:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-1824318633, {{[0-9]+}}(%esp), %ecx # imm = 0x93431B57
+; CHECK-NEXT:    incl %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $3, %ecx
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = srem i32 %X, 1073741927
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This tests the BuildREMEqFold optimization with SREM, i8, even divisor, SETNE.
+; The corresponding pseudocode is (with 12 = 3 * 2^2 => D0 = 3, K = 2):
+; Q <- [N * multInv(D0, 2^8)] <=> [N * multInv(3, 2^8)] <=> [N * 0xAB] <=> [N * (-85)]
+; a <- [((2^7 - 1) / 3) & -2^2] <=> [42 & -4] <=> 40
+; Q' <- Q + a
+; Q' <- [Q' >>rot 2]
+; res <- ![Q' <= 2a/2^2] <=> ![Q' <= 20] <=> [Q' > 20]
+define i8 @test_srem_even(i8 %X) nounwind readnone {
+; CHECK-LABEL: test_srem_even:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK-NEXT:    movb $-85, %cl
+; CHECK-NEXT:    mulb %cl
+; CHECK-NEXT:    addb $40, %al
+; CHECK-NEXT:    rorb $2, %al
+; CHECK-NEXT:    cmpb $20, %al
+; CHECK-NEXT:    seta %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = srem i8 %X, 12
+  %cmp = icmp ne i8 %0, 0
+  %ret = zext i1 %cmp to i8
+  ret i8 %ret
+}
+
+; This is like test_srem_odd_bit30, except the divisor is even.
+define i32 @test_srem_even_bit30(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_srem_even_bit30:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5
+; CHECK-NEXT:    addl $8, %ecx
+; CHECK-NEXT:    rorl $3, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $3, %ecx
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = srem i32 %X, 1073741928
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; This is the same as test_srem_even_bit30, but the divisor is negative.
+; The output should be the same.
+define i32 @test_srem_even_neg(i32 %X) nounwind readnone {
+; CHECK-LABEL: test_srem_even_neg:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    imull $-51622203, {{[0-9]+}}(%esp), %ecx # imm = 0xFCEC4EC5
+; CHECK-NEXT:    addl $8, %ecx
+; CHECK-NEXT:    rorl $3, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpl $3, %ecx
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    retl
+
+entry:
+  %0 = srem i32 %X, -1073741928
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We should not proceed with this fold if the divisor is 1 or -1.
+define i32 @test_srem_one(i32 %X) nounwind readnone {
+; CHECK-NOT: imull
+entry:
+  %0 = srem i32 %X, 1
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
+
+; We can lower remainder of division by powers of two much better elsewhere;
+; also, BuildREMEqFold does not work when the only odd factor of the divisor is 1.
+; This ensures we don't touch powers of two.
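+; (Here D <=> 16 <=> 1 * 2^4, so D0 = 1 and BuildREMEqFold bails out.)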
+define i32 @test_urem_pow2(i32 %X) nounwind readnone {
+; CHECK-NOT: imull
+entry:
+  %0 = urem i32 %X, 16
+  %cmp = icmp eq i32 %0, 0
+  %ret = zext i1 %cmp to i32
+  ret i32 %ret
+}
Index: test/CodeGen/X86/vselect-avx.ll
===================================================================
--- test/CodeGen/X86/vselect-avx.ll
+++ test/CodeGen/X86/vselect-avx.ll
@@ -85,17 +85,9 @@
 define void @test3(<4 x i32> %induction30, <4 x i16>* %tmp16, <4 x i16>* %tmp17, <4 x i16> %tmp3, <4 x i16> %tmp12) {
 ; AVX1-LABEL: test3:
 ; AVX1:       ## %bb.0:
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [1431655766,1431655766,1431655766,1431655766]
-; AVX1-NEXT:    vpmuldq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpmuldq %xmm4, %xmm0, %xmm4
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
-; AVX1-NEXT:    vpsrld $31, %xmm3, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT:    vpmulld LCPI{{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd LCPI{{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT:    vpminud LCPI{{.*}}(%rip), %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm1
 ; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
@@ -106,19 +98,12 @@
 ;
 ; AVX2-LABEL: test3:
 ; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1431655766,1431655766,1431655766,1431655766]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vpmuldq %xmm4, %xmm5, %xmm4
-; AVX2-NEXT:    vpmuldq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
-; AVX2-NEXT:    vpsrld $31, %xmm3, %xmm4
-; AVX2-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [3,3,3,3]
-; AVX2-NEXT:    vpmulld %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpsubd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [2863311531,2863311531,2863311531,2863311531]
+; AVX2-NEXT:    vpmulld %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [715827882,715827882,715827882,715827882]
+; AVX2-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1431655764,1431655764,1431655764,1431655764]
+; AVX2-NEXT:    vpminud %xmm3, %xmm0, %xmm3
 ; AVX2-NEXT:    vpcmpeqd %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0